
/*---------------------------------------------------------------------------*\
 *                     EXTRACT WORD LIST FROM WORD'S ACL FILES               *
 *---------------------------------------------------------------------------* 
 *  Author: Gabriel Zachmann
 *
 *  Aug 1998
 *  Sep 2001 - compiles under Linux, reads Windows-XP ACL files, too
 *---------------------------------------------------------------------------* 
 *  Module Description:                                                      *
 *    Write each pair in the autocorrect list (given as input acl file)      *
 *    to stdout in the form "iab left-word right-word".                      *
 *    Small words and words containing characters > 7F are skipped.          *
 *    Left-words with non-alphanumerical characters are skipped, too.        *
 *                                                                           *
 *  Implementation Issues:                                                   *
 *    There are no parameters (except for the input file).                   *
 *    It has been tested with word6 .acl files, I have no idea, yet,         *
 *    if it works with word7 or word98!                                      *
 *    There are a few hard-wired offsets and sentinels (ugh).                *
 *                                                                           *
 *    Under Windows XP, there seem to be other things in the ACL files,      *
 *    too, like abbreviations, but without a replacement string.             *
 *                                                                           *
 *    Words must not be longer than 255 characters.                          *
 *                                                                           *
 *    gcc -ansi -Wall -W -D_GNU_SOURCE -o extract-acl extract-acl.c          *
 *                                                                           *
 *  Flags:                                                                   *
 *    XP - define this, if your ACL files are from WindowsXP                 *
 *                                                                           *
 *                                                                           *
\*---------------------------------------------------------------------------*/


/*---------------------------------------------------------------------------*\
 *  Includes and Defines                                                     *
\*---------------------------------------------------------------------------*/


#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <ctype.h>


/*---------------------------------------------------------------------------*\
 *   Defines                                                                 *
\*---------------------------------------------------------------------------*/

#define XP

/*
 * MULT is the width of one character in the ACL file, in bytes:
 * 2 for Windows-XP files (16-bit characters, low byte first), 1 otherwise.
 * HEADER_LEN is the number of bytes preceding the word list.
 */
#ifdef XP
#	define MULT 2
#	define HEADER_LEN 84
#else
#	define MULT 1
#	define HEADER_LEN 52
#endif

/*
 * Stored in place of a 16-bit character that has no 8-bit equivalent.
 * It is deliberately outside the ASCII range, so that a filter for
 * non-ASCII characters rejects the word instead of seeing a truncated
 * (and therefore wrong) ASCII character.
 */
#define NON_ASCII_MARK '\200'


/*---------------------------------------------------------------------------*\
 *   Variables                                                               *
\*---------------------------------------------------------------------------*/


int infile;								/* descriptor of the open ACL file */
char *filenam;							/* its name, for error messages only */
#define buflen (100*MULT)				/* size of a word buffer in bytes */
char leftbuf[buflen], rightbuf[buflen];


/*---------------------------------------------------------------------------*\
 *   Functions                                                               *
\*---------------------------------------------------------------------------*/



/*
 * Read `want' bytes from infile into dst, continuing after short reads.
 *
 * Returns the number of bytes read, which is less than `want' only if the
 * end of the file was reached, or -1 after reporting a read error on stderr.
 */
static int read_fully( char *dst, int want )
{
	int got = 0;

	while ( got < want )
	{
		ssize_t n = read( infile, dst + got, (size_t)( want - got ) );

		if ( n < 0 )
		{
			perror("read");
			fprintf(stderr,"input file %s\n", filenam );
			return -1;
		}
		if ( n == 0 )
			break;							/* end of file */
		got += (int) n;
	}
	return got;
}



/*
 * Read the next word from infile into buf.
 *
 * A word is stored as one zero character, one length character and then
 * that many characters, each character being MULT bytes wide.  Under XP
 * the 16-bit characters are narrowed to 8 bits in place: U+2019 becomes
 * an apostrophe, characters up to 0xFF keep their value, and everything
 * else becomes NON_ASCII_MARK.
 *
 * buf must provide room for buflen bytes.
 *
 * Returns the length of the word in characters (buf is then
 * NUL-terminated), 0 if the length field is zero, or -1 on any error
 * (lost synchronisation, word too long, truncated file, read error);
 * errors are reported on stderr.
 */
int readword( char *buf )
{
	/* unsigned view of buf, so that bytes >= 0x80 do not turn negative */
	unsigned char *ubuf = (unsigned char *) buf;
	int nbytes, nchars, len;
#ifdef XP
	int i;
#endif


	/* every word is preceded by a zero character */
	nbytes = read_fully( buf, MULT );
	if ( nbytes < 0 )
		return -1;
	if ( nbytes < MULT )
	{
		fprintf(stderr,"extract-acl: couldn't read word!\n");
		return -1;
	}
#ifdef XP
	if ( ubuf[0] || ubuf[1] )
	{
		fprintf(stderr,"extract-acl: no longer sync'ed!\n"
				"(expected 00, read %0X,%0X (%c%c)\n",
				(unsigned) ubuf[0], (unsigned) ubuf[1], ubuf[0], ubuf[1] );
		return -1;
	}
#else
	if ( ubuf[0] )
	{
		fprintf(stderr,"extract-acl: no longer sync'ed!\n"
				"(expected 0, read %0X = %c)\n", (unsigned) ubuf[0], ubuf[0] );
		return -1;
	}
#endif

	/* read length of next word */
	nbytes = read_fully( buf, MULT );
	if ( nbytes < 0 )
		return -1;
	if ( nbytes < MULT )
	{
		fprintf(stderr,"extract-acl: couldn't read length of word!\n");
		return -1;
	}
#ifdef XP
	/* use both bytes: a non-zero high byte must not be mistaken for a
	 * short word, that would desynchronise everything that follows */
	len = ubuf[0] | ( ubuf[1] << 8 );
#else
	len = ubuf[0];
#endif
	if ( ! len )
		return 0;
	if ( len*MULT >= buflen )
	{
		fprintf(stderr,"extract-acl: word too long (%d)!\n", len );
		return -1;
	}

	/* read word */
	nbytes = read_fully( buf, len*MULT );
	if ( nbytes < 0 )
		return -1;
	nchars = nbytes / MULT;				/* complete characters received */

#ifdef XP
	/* convert 16-bit chars to 8-bit chars; writing buf[i] is safe because
	 * both source bytes at 2*i and 2*i+1 have been fetched before */
	for ( i = 0; i < nchars; i ++ )
	{
		unsigned lo = ubuf[i*MULT], hi = ubuf[i*MULT+1];

		if ( lo == 0x19 && hi == 0x20 )
			buf[i] = '\'';					/* U+2019, typographic apostrophe */
		else if ( hi == 0 )
			buf[i] = buf[i*MULT];
		else
			buf[i] = NON_ASCII_MARK;
	}
#endif

	buf[nchars] = 0;
	if ( nchars < len )
	{
		fprintf(stderr,"extract-acl: read only %d of %d char's of word (%s)!\n",
				nchars, len, buf );
		return -1;
	}

	return len;
}



/*
 * Usage: extract-acl <acl-file>
 *
 * Opens the ACL file named by the first argument, skips its HEADER_LEN
 * byte header and then reads (left, right) word pairs with readword()
 * until a left word of length 0 is returned.  Each accepted pair is
 * written to stdout as "iab left right"; skipped pairs and the final
 * pair count are reported on stderr.
 *
 * A pair is skipped if the left word contains a non-alphanumeric
 * character, if the right word contains a non-ASCII character, or if the
 * left word is shorter than 3 characters.
 *
 * Returns 0 on success; calls exit(-1) on any error.
 */
int main( int argc, char **argv )
{
	int i, left, right, bad, nbytes, wordpairs;


	if ( argc < 2 )
	{
		fprintf(stderr,"extract-acl: not enough command line arguments (%d)!\n",
				argc );
		exit(-1);
	}

	filenam = argv[1];
	infile = open( filenam, O_RDONLY );
	if ( infile < 0 )
	{
		perror("open");
		fprintf(stderr,"input file %s\n", filenam );
		exit(-1);
	}

	/* skip header; leftbuf only serves as scratch space here */
	nbytes = read( infile, leftbuf, HEADER_LEN );
	if ( nbytes < 0 )
	{
		perror("read");
		fprintf(stderr,"input file %s\n", filenam );
		exit(-1);
	}
	if ( nbytes < HEADER_LEN )
	{
		/* expected size first, actual size second, as the text says */
		fprintf(stderr,"extract-acl: less than %d bytes (%d) were read!\n",
				HEADER_LEN, nbytes );
		exit(-1);
	}

	wordpairs = 0;
	while ( 1 )
	{
		wordpairs ++ ;

		/* read left & right words */
		left = readword( leftbuf );
		if ( ! left )
		{
			/* the pair being counted was never read, hence the -1 */
			fprintf(stderr,"%d word pairs\n", wordpairs-1 );
			break;							/* end of word list */
		}
		if ( left < 0 )
		{
			fprintf(stderr,"error at %d-th left word!\n", wordpairs );
			exit(-1);
		}
		right = readword( rightbuf );
		if ( right <= 0 )
		{
			fprintf(stderr,"error at %d-th right word!\n", wordpairs );
			exit(-1);
		}

		/* if left word contains any non-alphanum char, skip this pair;
		 * the <ctype.h> functions require an unsigned char value, a
		 * negative plain char would be undefined behaviour */
		bad = 0;
		for ( i = 0; i < left; i ++ )
			if ( ! isalnum( (unsigned char) leftbuf[i] ) )
			{
				bad = 1;
				break;
			}
		if ( bad )
		{
			fprintf(stderr,"bad character in word (%s)\n", leftbuf );
			continue;
		}

		/* if right word contains any non-ascii, skip; tested directly,
		 * because isascii() is not part of ISO C */
		bad = 0;
		for ( i = 0; i < right; i ++ )
			if ( (unsigned char) rightbuf[i] > 0x7F )
			{
				bad = 1;
				break;
			}
		if ( bad )
		{
			fprintf(stderr,"bad character in word (%s)\n", rightbuf );
			continue;
		}

		/* skip too small words */
		if ( left < 3 )
		{
			fprintf(stderr,"skipping too small word (%s)\n", leftbuf );
			continue;
		}

		printf("iab %s %s\n", leftbuf, rightbuf );
	}

	close( infile );
	return 0;
}


