/*
 * corpus.h
 *
 *	Note: a "word" is an instance of a "term".
 *
 */

#ifndef CORPUS_H_
#define CORPUS_H_

#include <gsl/gsl_permutation.h>

/*
 * A single document of a corpus.
 *
 * NOTE(review): this header does not establish what the fields mean; the
 * per-field notes below are inferred from names only -- confirm against the
 * code that fills the structure in.
 */
struct doc {
	int total;   /* presumably the number of entries in words -- TODO confirm */
	int yd;      /* NOTE(review): purpose not evident from this header */
	int *words;  /* presumably an array of word (term) ids -- TODO confirm */
};
typedef struct doc doc;

/*
 * A collection of documents.
 *
 * NOTE(review): field meanings other than max_length are inferred from
 * names -- confirm against the code that builds a Corpus.
 */
struct Corpus {
	int total;       /* presumably the word count over all docs -- TODO confirm */
	int ndocs;       /* presumably the number of entries in docs -- TODO confirm */
	int max_length;  /* the maximum length of paragraphs */
	doc *docs;       /* pointer to the documents of this corpus */
};
typedef struct Corpus Corpus;

/*
 * Associates an integer id with a string held in a fixed 100-byte buffer.
 */
struct id_2_word {
	int id;             /* integer id paired with word_str */
	char word_str[100]; /* inline storage; presumably NUL-terminated text -- TODO confirm */
};
typedef struct id_2_word id_2_word;

/*
 * A vocabulary: a size plus a pointer to id_2_word entries.
 */
struct vocabulary {
	int size;             /* presumably the number of entries in word_map -- TODO confirm */
	id_2_word *word_map;  /* pointer to the id/string entries */
};
typedef struct vocabulary vocabulary;

/*
 * functions
 */
/*
 * NOTE(review): the definitions of the functions declared below are not part
 * of this header. The one-line descriptions are inferred from the function
 * and parameter names only -- confirm them against the implementation.
 */

/* Presumably loads data into the corpora c_tr and c_te; the role of percent
 * is not visible here -- TODO confirm. */
void read_data(char *dir, char *file, char *file_tr, Corpus *c_tr, Corpus *c_te, double percent);

/* Presumably writes a corpus to the location named by the string argument. */
void write_corpus(Corpus *, char *);

/* Presumably splits the data found under dir into train and test parts. */
void split_train_test_data(char *dir, char **file);

/* Presumably divides corpus s between c1 and c2. */
void split_corpus(Corpus *s, Corpus *c1, Corpus *c2);

/* Presumably reads a vocabulary from, and writes one to, the location named
 * by the string argument. */
vocabulary *read_vocabulary(char *);
void write_vocabulary(vocabulary *, char *);

/* Presumably release the storage associated with the given object. */
void free_corpus(Corpus *);
void free_vocabulary(vocabulary *);


#endif /* CORPUS_H_ */
