//      concordance.c
//      spls benchmark concordance programme
//      
//      Copyright 2010 wpc <wpc@normanby>
//      
//      This program is free software; you can redistribute it and/or modify
//      it under the terms of the GNU General Public License as published by
//      the Free Software Foundation; either version 2 of the License, or
//      (at your option) any later version.
//      
//      This program is distributed in the hope that it will be useful,
//      but WITHOUT ANY WARRANTY; without even the implied warranty of
//      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//      GNU General Public License for more details.
//      
//      You should have received a copy of the GNU General Public License
//      along with this program; if not, write to the Free Software
//      Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
//      MA 02110-1301, USA.
#define NDEBUG
#include <pthread.h>

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define WORDTOKEN long 
#define STOPCHAR 255
#define DIGRAMBRANCH 1
#define HASHM 27
#define REPTHRESH 2
//#define TRACEBACK
struct NgramCell{	 
	struct NgramCell * lowerWordsAtThisLevel;  
    struct NgramCell * mother; 
	struct NgramCell * higherWordsAtThisLevel; 
	struct NgramCell * pointerDownToNgramPlus1level;
	int positionOfFirstOccurenceOfStartOfNgram;
	int count,previousInstancesOfThisSeq;
	char terminatingCell;
};
struct dictEntry{	 
	struct dictEntry * overflow;
	struct dictEntry * linkToAllOtherUniqueWords;
	int positionOfFirstOccurence;
	struct NgramCell * digrams[DIGRAMBRANCH];
	int count;
};
static struct dictEntry ** theDictionary; // all unique words indexed in this hash table
static int N;
static int filelength;
static struct dictEntry *listOfUniqueWords ;
static WORDTOKEN *wordArray;
static char ** wordStartArray;
static int * wordLengthArray;
static int likelyMaxNumberOfWords,actualWordCount;
static char * filebuf;
int tokenize();
void printCodeWord(int ,FILE * );
void checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( struct NgramCell **,int,int,struct NgramCell *);
void recordWordAtPosition(WORDTOKEN theWord, int thePosition )
{
	int hashPosition = theWord % likelyMaxNumberOfWords;
	int i;
	register struct dictEntry * newEntry, *anEntry;
#ifndef NDEBUG
	printCodeWord(thePosition); 
#endif
	anEntry=theDictionary[hashPosition];
	if((anEntry) == NULL){
#ifndef NDEBUG
	  printf(" added to index\n");
#endif
		createNewRecord:
		newEntry = (struct dictEntry *) malloc(sizeof(struct dictEntry));		 
		newEntry->overflow=theDictionary[hashPosition];
		newEntry->count=1;
		newEntry->positionOfFirstOccurence = thePosition;
		for(i=0;i<DIGRAMBRANCH;i++)newEntry->digrams[i] = NULL;
		theDictionary[hashPosition]=newEntry;
		newEntry->linkToAllOtherUniqueWords = listOfUniqueWords;
		listOfUniqueWords = newEntry;
		anEntry = newEntry;
	}
	
	{
		
		//   first we need to check it is not just a clash in the hash table in which
        // case we will have to deal with it by overflow chaining.
       		while ( (wordArray[anEntry->positionOfFirstOccurence]!=theWord))
        	{
			anEntry = anEntry->overflow;
			if(anEntry == NULL){ goto createNewRecord;}
		}
		assert(wordArray[anEntry->positionOfFirstOccurence]==theWord);
		anEntry->count++;
		// this is not the first occurence of the word in the file so we consider the 
		// words that follow up to a max of N
		checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ(&(anEntry->digrams[wordArray[thePosition+1] %DIGRAMBRANCH]),thePosition,0,NULL);
	}
}
void newngram( struct NgramCell ** This,int Position,int J,struct NgramCell *mother){
	    struct NgramCell * pACell  ; 
        
		
        // this is the first time the suffix has had this added to it so we 
		// create a new NgramCell to record the suffix
		pACell = (struct NgramCell * )malloc(sizeof(struct NgramCell ));
		pACell->positionOfFirstOccurenceOfStartOfNgram = Position;
		pACell->lowerWordsAtThisLevel=NULL;
        pACell->higherWordsAtThisLevel =NULL; 
	    pACell->pointerDownToNgramPlus1level=NULL;
		pACell->count= 1;
#ifdef TRACEBACK
		int ok;struct NgramCell * back =mother;
		pACell->mother = mother;
		pACell->previousInstancesOfThisSeq=0;
		ok=1;
        while(ok && back != NULL)
        {// check if any ancestor is identical and increment count accordingly
			   int i;
               for( i=0;i<=J;i++)
			         if( wordArray[Position+i] != wordArray[back->positionOfFirstOccurenceOfStartOfNgram+i])ok=0;
			   if (ok) pACell -> previousInstancesOfThisSeq ++;
			   back= back -> mother;
	     }  
#endif
		pACell->terminatingCell=1;
		
	     *This = pACell;
	    return;
	}
void checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( struct NgramCell ** This,int Position,int J, struct NgramCell *motherNode)
{	//if(J>=N) return;// we are only interested in concordances up to this level
    if(J+Position >= actualWordCount) return;
	struct NgramCell * pACell = * This; 
	register int thisword,existingword;
 
	if(pACell == NULL)
	{
		 newngram(This, Position,J,motherNode);
		 pACell = *This ;
		 if(J>=N)return;
		 pACell -> count =0;pACell ->terminatingCell=0;
	}
	if((thisword=wordArray[Position+J])!=(existingword=wordArray[pACell->positionOfFirstOccurenceOfStartOfNgram +J]))
	{ // we will have to recurse down the list to find an occurence of this suffix
        if(thisword>existingword)
	    {
		checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( &(pACell->lowerWordsAtThisLevel),  Position,  J,motherNode);
	    }
	    else
	    {
		checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( &(pACell->higherWordsAtThisLevel),  Position,  J,motherNode);
	    };
	   
	    return;
	}
	assert(wordArray[Position+J]==wordArray[pACell->positionOfFirstOccurenceOfStartOfNgram +J]);
	// we have found a repetition of a n Ngram of length J now see if it is part of an Ngram of length J+1
	pACell->count++;
	checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( &(pACell->pointerDownToNgramPlus1level),  Position,  J+1,pACell);
}
void printCodeWord(int position,FILE*f)
{
 
	char *s = wordStartArray[position];
	fwrite(s, sizeof(char), wordLengthArray[position],f);
 
	 fprintf(f," ");
}
int countOccurences(struct NgramCell *pCell)
{ 
	 int i=0;
	 if (pCell==NULL) return 0;
#ifdef TRACEBACK
  i=pCell->previousInstancesOfThisSeq;
#endif
  return  pCell->count+i;
}
int offset(struct NgramCell *pCell)
{
		return    pCell->positionOfFirstOccurenceOfStartOfNgram ;
}
void listOccurences(struct NgramCell *pCell ,FILE *f)
{   
   if (pCell==NULL) return ; 
   if(pCell->terminatingCell)fprintf(f,"%d ",offset(pCell));
    
  listOccurences(pCell->pointerDownToNgramPlus1level,f);
  listOccurences(pCell->lowerWordsAtThisLevel,f);
  listOccurences(pCell->higherWordsAtThisLevel,f);
  
   
}
void printThisNgram( struct NgramCell *pCell, int level,FILE *f )
{
	if(pCell==NULL)return;
	if (level>N) return;
#ifdef TRACEBACK
	struct NgramCell *back;
#endif
	int j=countOccurences(pCell);int i;
	if (j<REPTHRESH) return;
	fprintf(f,"<\"");
	for(i=0;i<level;i++) printCodeWord(i+pCell ->positionOfFirstOccurenceOfStartOfNgram,f);
	
	fprintf(f,"\":(%d,[",j );
#ifdef TRACEBACK
	for(i=1,back=pCell->mother;i<=pCell->previousInstancesOfThisSeq;i++)
	{
		fprintf(f,"%d ",offset(back));back=back->mother;
	}
#endif 
	
	if (pCell->terminatingCell)fprintf(f,"%d ",offset(pCell));
	listOccurences(pCell->pointerDownToNgramPlus1level ,f);
	fprintf(f,"])>\n");
}
void printRepetitions( struct NgramCell *pCell, int level,FILE *f)
{
  
	if(pCell != NULL)
	{   printThisNgram(pCell,level ,f);
		if( pCell-> pointerDownToNgramPlus1level != NULL)
		{ 
			// this ngram was repeated at least once		
			printRepetitions(pCell-> pointerDownToNgramPlus1level,level + 1,f);
		}
		printRepetitions(pCell -> lowerWordsAtThisLevel,level,f);
		printRepetitions(pCell -> higherWordsAtThisLevel,level,f);
	}

}
#define NUM_THREADS 2
#define PMASK 1
struct threadblock{int mask; FILE *f;};

int enableprint=0;
void printResults( int mask,FILE *f)
{
	struct dictEntry * wordList=listOfUniqueWords ;
	int wordcount=0;
	int i;
	
	while (wordList != NULL)
	{
		if((wordArray[wordList->positionOfFirstOccurence] & PMASK)==mask)
		{
			wordcount++; 
			for(i=0;i<DIGRAMBRANCH;i++){
				 
				printRepetitions(wordList -> digrams[i],1,f);
			}
			 
		} 
			wordList = wordList -> linkToAllOtherUniqueWords;
			
	}
	//printf("words %d unique words %d\n",actualWordCount,wordcount);
} 


void * recordem(void * p){
	int i;	struct threadblock *t =p;
	int mask = t->mask;FILE *f=t->f;
	//printf("recordem(%d)\n",mask);
	for(i=0;i<actualWordCount;i++)
			if((wordArray[i]&PMASK)==mask)recordWordAtPosition(wordArray[i],i );
	if(enableprint)printResults(mask,f);
	pthread_exit(NULL);

	
}
int main(int argc, char** argv)
{	FILE * f;int i;
    pthread_t threads[NUM_THREADS];
    struct threadblock blocks[NUM_THREADS];
    char string[80];
    pthread_attr_t attr;

	if(argc <3 ) {
		printf("Usage : concordance filename number [P]\n");
		return -1;
	}
	if(argc==4)if(argv[3][0]=='P')
	{
			enableprint=1;
			for(i=0;i<NUM_THREADS;i++)
			{
					sprintf(string, "%s%d.con",argv[1],i);
					blocks[i].f=fopen(string,"w");
			}
	}
	listOfUniqueWords=NULL;
	sscanf(argv[2],"%d",&N);
	f=fopen(argv[1],"r");
	if (f==NULL){printf("input file %s not found\n",argv[1]);exit(1);}
	fseek(f,0,SEEK_END);
	filelength = ftell(f);
	likelyMaxNumberOfWords = filelength/1;
	fseek(f,0,SEEK_SET);
	filebuf=(char *)malloc(sizeof(char)*(filelength+1));
	fread(filebuf, filelength,   1, f);    // block read for speed
	filebuf[filelength]=STOPCHAR;        // put in an end of file marker
	wordArray=(WORDTOKEN *)malloc(sizeof(WORDTOKEN)* likelyMaxNumberOfWords);// create array to hold tokenised input
	wordStartArray=(char **)malloc(sizeof(char*)* likelyMaxNumberOfWords);
	wordLengthArray=(int *)malloc(sizeof(int)* likelyMaxNumberOfWords);
    theDictionary = (struct dictEntry **)malloc(sizeof(struct dictEntry **)*likelyMaxNumberOfWords); // create and intialise has table
	for(i=0;i<likelyMaxNumberOfWords;i++)theDictionary[i]=NULL;
	actualWordCount=tokenize(filebuf);
	int rc;
    long t;
     /* Initialize and set thread detached attribute */
     pthread_attr_init(&attr);
     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);


    for(t=0; t<NUM_THREADS; t++){
    
      blocks[t].mask=t;
      rc = pthread_create(&threads[t], NULL, recordem,(void*)& blocks[t]);
      if (rc){
         printf("ERROR; return code from pthread_create() is %d\n", rc);
         exit(-1);
      }
    }
   /* Free attribute and wait for the other threads */
   pthread_attr_destroy(&attr);
   for(t=0; t<NUM_THREADS; t++) {
	   void* status;
      rc = pthread_join(threads[t], &status);
      if (rc) {
         printf("ERROR; return code from pthread_join()  is %d\n", rc);
         exit(-1);
         }
     // printf("Main: completed join with thread %ld having a status   of %ld\n",t,(long)status);
      }

	 
	return 0;
}


//  tokenize -- this maps the file to a sequence of word tokens using hashing and a finite state recogniser
// this has a state machine that has the following states: APPENDING, SKIPING, HALTED
// it returns the actual number of words
#define APPENDING 0
#define SKIPING 1
#define HALTED 2
#define FIRSTCHAR 3
#define FIRSTSKIP 4
#define TOPSTATE FIRSTSKIP
char *statenames[TOPSTATE+1]={"APPENDING",
"SKIPING",
"HALTED",
"FIRSTCHAR",
"FIRSTSKIP"}; 
static char NextState[TOPSTATE][256];
int tokenize(char *f){
	int state = SKIPING;
     void initTransTable();
	WORDTOKEN tok=0;
	int count=0;
	initTransTable();

	while(state != HALTED){
     
		state=NextState[state][(int)(*f  ) ];
 
		switch (state){
			    case FIRSTCHAR: 
			    tok=0;
			    wordStartArray[count]=f;
				case APPENDING:
				tok = 0x7fffffff& ((HASHM*tok )+(*f   ));f  ++ ; break;
				case FIRSTSKIP:
				wordLengthArray[count] = f - wordStartArray[count];
				wordArray[count ++] = tok;
				case SKIPING:
				 f ++; 
		}
	}
	return count;
	}
void initTransTable()
{// this sets up the transition table to recognise words
 int c;
 for (c=0;c<256;c++){
	 if (c>127){NextState[HALTED][c]=NextState[SKIPING][c]=NextState[APPENDING][c] = NextState[FIRSTCHAR][c] =NextState[FIRSTSKIP][c]=HALTED;}
	 else if((c<='z' && c>='a')||(c<='Z' && c>='A')||c=='\''){
			NextState[FIRSTSKIP][c]=NextState[SKIPING][c]=FIRSTCHAR;
			NextState[FIRSTCHAR][c]=NextState[APPENDING][c] = APPENDING;
			NextState[HALTED][c]=HALTED;
	}else
	{
			NextState[SKIPING][c]=NextState[FIRSTSKIP][c] = SKIPING;
			NextState[APPENDING][c]=NextState[FIRSTCHAR][c]= FIRSTSKIP;
			NextState[HALTED][c]=HALTED;
	}
 }
 
}