Page 143

Write a cross-referencer that prints a list of all words in a document, and, for each word, a list of the line numbers on which it occurs. Remove noise words like "the", "and," and so on.

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#include <string.h>

#define MAX_LEN 1000

struct ll_node {
	int line_num;
	struct ll_node *next;
};

struct tree_node {
	char *word;
	struct ll_node *line_ll;
	struct tree_node *left;
	struct tree_node *right;
};

int getword(char *word, int lim);
struct tree_node *add_node(struct tree_node *node, char *word, int line_num);
void print_tree(struct tree_node *root);

int main()
{
	char str[MAX_LEN];
	struct tree_node *root = NULL;

	int c, line_num = 1;
	while ((c = getword(str, MAX_LEN)) != EOF) {
		if (c == '\n') {
			line_num++;
		} else if (isalpha(str[0])) {
			root = add_node(root, str, line_num);
		}
	}

	print_tree(root);
}

struct tree_node *alloc_tree_node();
struct ll_node *alloc_ll_node();
struct tree_node *add_node(struct tree_node *node, char *word, int line_num)
{
	int comp_res;
	if (node == NULL) {
		node = alloc_tree_node();
		node->word = strdup(word);
		node->line_ll = alloc_ll_node();
		node->line_ll->line_num = line_num;
		node->left = NULL;
		node->right = NULL;
	} else if ((comp_res = strcmp(word, node->word)) == 0) {
		struct ll_node *ll_node = alloc_ll_node();
		ll_node->line_num = line_num;
		ll_node->next = node->line_ll;
		node->line_ll = ll_node;
	} else if (comp_res < 0) {
		node->left = add_node(node->left, word, line_num);
	} else { // comp_res > 0
		node->right = add_node(node->right, word, line_num);
	}

	return node;
}

struct tree_node *alloc_tree_node()
{
	return (struct tree_node *)malloc(sizeof(struct tree_node));
}

struct ll_node *alloc_ll_node()
{
	return (struct ll_node *)malloc(sizeof(struct ll_node));
}

void print_tree(struct tree_node *node)
{
	if (node != NULL) {
		print_tree(node->left);

		printf("%s: ", node->word);
		struct ll_node *ll_node = node->line_ll;
		do {
			printf("%d ", ll_node->line_num);
		} while ((ll_node = ll_node->next) != NULL);
		putchar('\n');

		print_tree(node->right);
	}
}

int getch();
void ungetch(int c);

int getword(char *word, int lim)
{
	int c, getch(void);
	void ungetch(int);
	char *w = word;

	if ((c = getch()) == '\n') {
		return '\n';
	}

	while (isspace(c))
		c = getch();

	if (c != EOF) {
		*w++ = c;
	}

	if (!isalpha(c)) {
		*w = '\0';
		return c;
	}

	for (; --lim > 0; w++) {
		if (!isalnum(*w = getch())) {
			ungetch(*w);
			break;
		}
	}
	*w = '\0';

	return word[0];
}

#define BUFSIZE 100
char buf[BUFSIZE];
int bufp = 0;

int getch(void)
{
	return (bufp > 0) ? buf[--bufp] : getchar();
}

void ungetch(int c)
{
	if (bufp >= BUFSIZE) {
		printf("ungetch: too many characters\n");
	} else {
		buf[bufp++] = c;
	}
}