/*
 * corpus.c
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <sys/stat.h>

#include "corpus.h"

/* Vocabulary size; read_vocabulary() sets it to the size of the vocabulary it loads. */
int v_size;
#ifdef MULTIC
/* Number of classes; read_data() sets it to (largest training label + 1), never below 1. */
int nclass;
/* Per-class training document counts (nclass entries); allocated and filled by read_data(). */
int *ndocs_class;
#endif

/*
 * Currently a no-op: the entire body is commented out, so `dir` and `file`
 * are never read and no file is created.
 *
 * The disabled code below makes two passes over each input file in a
 * "doc:/para:/word:" text format: the first pass counts the documents, the
 * second copies the first 80% of them to "train_<name>" and the remainder to
 * "test_<name>" inside `dir`. It loops up to an identifier `I` that is not
 * declared in this function.
 */
void split_train_test_data(char* dir, char** file)
{
/*	FILE* fileptr, *fileptr_tr, *fileptr_te;
	char fileName[BUFSIZ];
	int i, j, l, nparas, nwords, n, n_tr, nn;
	int word;
	int tmp;

	for(i = 0; i < I; ++i){
		printf("reading data from %s/%s\n", dir, file[i]);
		sprintf(fileName, "%s/%s", dir, file[i]);
		fileptr = fopen(fileName, "r");
		sprintf(fileName, "%s/train_%s", dir, file[i]);
		fileptr_tr = fopen(fileName, "w");
		sprintf(fileName, "%s/test_%s", dir, file[i]);
		fileptr_te = fopen(fileName, "w");
		if(!fileptr){
			printf("Cannot open file %s/%s\n", dir, file[i]);
			exit(0);
		}

		n = 0;
		while (1){
			tmp = fscanf(fileptr, "doc:%d\n", &nparas);
			assert(tmp != 0);
			if(tmp == EOF)
				break;
			++n;
			for (j = 0; j < nparas; j++){
				tmp = fscanf(fileptr, "para:%d\n", &nwords);
				assert(tmp != 0);
				for (l = 0; l < nwords; l++){
					tmp = fscanf(fileptr, "word:%d\n", &word);
					assert(tmp != 0);
				}
			}
		}
		fclose(fileptr);
		sprintf(fileName, "%s/%s", dir, file[i]);
		fileptr = fopen(fileName, "r");
		if(!fileptr){
			printf("Cannot open file %s/%s\n", dir, file[i]);
			exit(0);
		}
		n_tr = (int)n * 0.8;
		nn = 0;
		while (1){
			tmp = fscanf(fileptr, "doc:%d\n", &nparas);
			assert(tmp != 0);
			if(tmp == EOF)
				break;
			++nn;
			if(nn <= n_tr)
				fprintf(fileptr_tr, "doc:%d\n", nparas);
			else
				fprintf(fileptr_te, "doc:%d\n", nparas);

			for (j = 0; j < nparas; j++){
				tmp = fscanf(fileptr, "para:%d\n", &nwords);
				assert(tmp != 0);
				if(nn <= n_tr)
					fprintf(fileptr_tr, "para:%d\n", nwords);
				else
					fprintf(fileptr_te, "para:%d\n", nwords);
				for (l = 0; l < nwords; l++){
					tmp = fscanf(fileptr, "word:%d\n", &word);
					assert(tmp != 0);
					if(nn <= n_tr)
						fprintf(fileptr_tr, "word:%d\n", word);
					else
						fprintf(fileptr_te, "word:%d\n", word);
				}
			}
		}
		fclose(fileptr);
		fclose(fileptr_tr);
		fclose(fileptr_te);
	}*/
}

/*
 * Open "<dir>/<name>" for reading.
 *
 * Exits the process when the joined path does not fit the buffer or the file
 * cannot be opened; never returns NULL.
 */
static FILE* open_corpus_file(const char* dir, const char* name)
{
	char fileName[BUFSIZ];
	FILE* fileptr;
	int len;

	/* bounded formatting: an over-long path is rejected instead of overflowing fileName */
	len = snprintf(fileName, sizeof(fileName), "%s/%s", dir, name);
	if(len < 0 || (size_t)len >= sizeof(fileName)){
		printf("File name too long: %s/%s\n", dir, name);
		exit(EXIT_FAILURE);
	}
	fileptr = fopen(fileName, "r");
	if(!fileptr){
		printf("Cannot open file %s/%s\n", dir, name);
		exit(0);	/* exit status 0 is kept from the original behaviour */
	}
	return fileptr;
}

/*
 * Read `ndocs` documents from `fileptr` into `c`.
 *
 * Each document is parsed as: label, word count, then that many word ids,
 * all whitespace separated. Sets c->docs, c->ndocs, c->total and
 * c->max_length. Malformed input trips an assert; a failed allocation
 * terminates the process.
 */
static void read_corpus_docs(FILE* fileptr, Corpus* c, int ndocs)
{
	int i, j, tmp;
	doc* d;

	c->docs = (doc*) malloc(sizeof(doc) * ndocs);
	/* malloc(0) may legitimately return NULL, so only a non-empty request can fail */
	if(ndocs > 0 && !c->docs){
		printf("Cannot allocate memory for %d documents\n", ndocs);
		exit(EXIT_FAILURE);
	}
	c->ndocs = ndocs;
	c->total = 0;
	c->max_length = 0;

	for(i = 0; i < ndocs; i++){
		d = &c->docs[i];

		tmp = fscanf(fileptr, "%d", &d->yd);
		assert(tmp == 1);
#ifdef MULTIC
		assert(d->yd >= 0);
#endif
		/* "== 1" (not "!= 0") so that EOF, which is negative, is caught as well */
		tmp = fscanf(fileptr, " %d", &d->total);
		assert(tmp == 1);

		c->total += d->total;
		if(c->max_length < d->total){
			c->max_length = d->total;
		}

		d->words = (int*)malloc(sizeof(int) * d->total);
		if(d->total > 0 && !d->words){
			printf("Cannot allocate memory for %d words\n", d->total);
			exit(EXIT_FAILURE);
		}
		for(j = 0; j < d->total; j++){
			tmp = fscanf(fileptr, " %d", &d->words[j]);
			assert(tmp == 1);
			assert(d->words[j] >= 0);
		}
		/* swallow the whitespace that ends the document line */
		tmp = fscanf(fileptr, "\n");
	}
	(void)tmp;	/* only inspected by assert(); keeps NDEBUG builds warning-free */
}

/*
 * Load the training corpus and, optionally, a test corpus from text files.
 *
 * Each file starts with an integer document count, followed by one record
 * per document: label, word count, word ids.
 *
 * dir      directory that contains the data files
 * file     training file name, relative to dir
 * file_te  test file name, relative to dir, or NULL to skip the test corpus
 * c_tr     receives the first (int)(count * percent) documents of `file`
 * c_te     receives every document of `file_te`; left untouched when
 *          file_te is NULL
 * percent  fraction of the training file's documents to load
 *
 * When MULTIC is defined, the globals nclass and ndocs_class are recomputed
 * from the training labels.
 *
 * The process exits if a file cannot be opened or memory runs out; malformed
 * input trips an assert.
 */
void read_data(char* dir, char* file, char *file_te, Corpus* c_tr, Corpus* c_te, double percent)
{
	FILE* fileptr;
	int ndocs, ntr;
	int tmp;
#ifdef MULTIC
	int i;
#endif

	printf("reading train data from %s/%s\n", dir, file);
	fileptr = open_corpus_file(dir, file);
	ndocs = 0;
	tmp = fscanf(fileptr, "%d\n", &ndocs);
	assert(tmp == 1);
	ntr = (int)(ndocs * percent);
	read_corpus_docs(fileptr, c_tr, ntr);
	fclose(fileptr);

#ifdef MULTIC
	/* labels are 0-based, so the class count is the largest label plus one */
	nclass = 1;
	for(i = 0; i < ntr; i++){
		if(nclass < c_tr->docs[i].yd + 1){
			nclass = c_tr->docs[i].yd + 1;
		}
	}
	ndocs_class = (int*)calloc(nclass, sizeof(int));
	if(!ndocs_class){
		printf("Cannot allocate memory for %d class counters\n", nclass);
		exit(EXIT_FAILURE);
	}
	for(i = 0; i < ntr; i++){
		ndocs_class[c_tr->docs[i].yd]++;
	}
#endif

	if(file_te){
		printf("reading test data from %s/%s\n", dir, file_te);
		fileptr = open_corpus_file(dir, file_te);
		ndocs = 0;
		tmp = fscanf(fileptr, "%d\n", &ndocs);
		assert(tmp == 1);
		read_corpus_docs(fileptr, c_te, ndocs);
		fclose(fileptr);	/* the test stream used to be left open */
	}
	/* without a test file c_te is left exactly as the caller passed it */
	(void)tmp;

	/*
	 * NOTE(review): `ndocs` is the count from the most recently read file
	 * header (the test file when one was given), while the other two figures
	 * describe the training corpus. Kept as in the original; confirm intent.
	 */
	printf("\n>>>>>>statistic for loaded corpus <<<<<<\n");
	printf("number of documents      : %d\n", ndocs);
	printf("words total              : %d\n", c_tr->total);
	printf("max length               : %d\n", c_tr->max_length);
	printf(">>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<\n");
}


/*
 * Intended to write a corpus to a file.
 *
 * Not implemented: the body is empty, so `c` and `filename` are ignored and
 * no file is created.
 */

void write_corpus(Corpus* c, char* filename)
{

}

/*
 * Allocate storage for `n` word ids. At least one slot is always requested so
 * the returned pointer is never NULL, even for an empty document. Terminates
 * the process when the allocation fails.
 */
static int* split_alloc_words(int n)
{
	int* words = (int*)malloc(sizeof(int) * (n > 0 ? n : 1));

	if(!words){
		printf("Cannot allocate memory for %d words\n", n);
		exit(EXIT_FAILURE);
	}
	return words;
}

/*
 * Split a corpus into two corpora with the same number of documents.
 *
 * For every document of `s`, the words at even positions (0, 2, 4, ...) go to
 * the matching document of `c1` and the words at odd positions go to the
 * matching document of `c2`. Both copies keep the source document's label.
 *
 * s   source corpus (not modified)
 * c1  receives the even-position words; its docs array is allocated here
 * c2  receives the odd-position words; its docs array is allocated here
 *
 * Each half is allocated once at its final size instead of being grown one
 * word at a time. The process exits if memory runs out.
 */
void split_corpus(Corpus* s, Corpus* c1, Corpus* c2)
{
	int i, l, n;
	doc* src;
	doc* even;
	doc* odd;

	c1->docs = (doc*)malloc(sizeof(doc)*s->ndocs);
	c2->docs = (doc*)malloc(sizeof(doc)*s->ndocs);
	/* malloc(0) may legitimately return NULL, so only a non-empty request can fail */
	if(s->ndocs > 0 && (!c1->docs || !c2->docs)){
		printf("Cannot allocate memory for %d documents\n", s->ndocs);
		exit(EXIT_FAILURE);
	}

	c1->ndocs = s->ndocs;
	c1->total = 0;
	c1->max_length = 0;

	c2->ndocs = s->ndocs;
	c2->total = 0;
	c2->max_length = 0;

	for(i = 0; i < s->ndocs; i++){
		src = &s->docs[i];
		even = &c1->docs[i];
		odd = &c2->docs[i];

		/* a non-positive length yields two empty halves */
		n = src->total > 0 ? src->total : 0;

		odd->total = n / 2;
		even->total = n - odd->total;	/* gets the extra word when n is odd */
		even->words = split_alloc_words(even->total);
		odd->words = split_alloc_words(odd->total);
		even->yd = src->yd;
		odd->yd = src->yd;

		for(l = 0; l < n; l++){
			if(l % 2 == 0){
				even->words[l / 2] = src->words[l];
			}else{
				odd->words[l / 2] = src->words[l];
			}
		}

		if(c1->max_length < even->total){
			c1->max_length = even->total;
		}
		if(c2->max_length < odd->total){
			c2->max_length = odd->total;
		}
		c1->total += even->total;
		c2->total += odd->total;
	}
}

/*
 * create word_map which map the term in vocabulary to index
 *
 * file: the file contains the corpus vocabulary in the format: index:word_string
 */
vocabulary* read_vocabulary(char* file1)
{
	char word_str[BUFSIZ];
	int word_index, count;
	FILE* fileptr;
	vocabulary* v;

	printf("Reading vocabulary ...\n");
	v = (vocabulary*) malloc(sizeof(vocabulary));
	if(file1){
		fileptr = fopen(file1, "r");
		if(!fileptr){
			printf("Cannot open vocabulary file %s\n", file1);
			exit(0);
		}
		count = 0;
		v->word_map = (id_2_word*) malloc(sizeof(id_2_word) * 1);
		while ((fscanf(fileptr, "%s", word_str)) != EOF){
			v->word_map = (id_2_word*) realloc(v->word_map, sizeof(id_2_word) * (1 + count));
			v->word_map[count].id = count;
			strcpy(v->word_map[count].word_str, word_str);
			count++;
		}
		fclose(fileptr);
		v->size = count;
		printf("the size of the sift vocabulary: %d\n", v->size);
	}else{
		printf("vocabulary file null...\n");
		exit(0);
	}
	v_size = v->size;
	return (v);
}

/*
 * Write a vocabulary to a text file, one entry per line as "<id>:<word>".
 *
 * v      vocabulary to write
 * file1  path of the output file; it is created or truncated
 *
 * The process exits with a failure status when the file cannot be opened
 * (previously a NULL stream was passed straight to fprintf). A write error
 * detected while closing the file is reported but does not stop the program.
 */
void write_vocabulary(vocabulary* v, char* file1)
{
	FILE* fileptr;
	int i;

	fileptr = fopen(file1, "w");
	if(!fileptr){
		printf("Cannot open vocabulary file %s for writing\n", file1);
		exit(EXIT_FAILURE);
	}
	for (i = 0; i < v->size; i++){
		fprintf(fileptr, "%d:%s\n", v->word_map[i].id, v->word_map[i].word_str);
	}
	/* fclose flushes buffered output, so this is where a full disk shows up */
	if(fclose(fileptr) != 0){
		printf("Error while writing vocabulary file %s\n", file1);
	}
}

/*
 * Release a corpus: the word array of every document, the document array,
 * and finally the Corpus structure itself, which therefore must have been
 * heap-allocated.
 *
 * A NULL corpus is accepted and ignored, mirroring free(NULL); a corpus whose
 * docs pointer is NULL is released without touching any document.
 */
void free_corpus(Corpus* c){
	int i;

	if(!c){
		return;
	}
	if(c->docs){
		for (i = 0; i < c->ndocs; i++){
			free(c->docs[i].words);
		}
		free(c->docs);
	}
	free(c);
}

/*
 * Release a vocabulary: its word map and the vocabulary structure itself,
 * which therefore must have been heap-allocated.
 *
 * A NULL vocabulary is accepted and ignored, mirroring free(NULL).
 */
void free_vocabulary(vocabulary* v){
	if(!v){
		return;
	}
	free(v->word_map);
	free(v);
}
