// concordance.c // spls benchmark concordance programme // // Copyright 2010 wpc // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, // MA 02110-1301, USA. #define NDEBUG #include #include #include #include #define WORDTOKEN long #define STOPCHAR 255 #define DIGRAMBRANCH 1 #define HASHM 27 #define REPTHRESH 2 //#define TRACEBACK struct NgramCell{ struct NgramCell * lowerWordsAtThisLevel; struct NgramCell * mother; struct NgramCell * higherWordsAtThisLevel; struct NgramCell * pointerDownToNgramPlus1level; int positionOfFirstOccurenceOfStartOfNgram; int count,previousInstancesOfThisSeq; char terminatingCell; }; struct dictEntry{ struct dictEntry * overflow; struct dictEntry * linkToAllOtherUniqueWords; int positionOfFirstOccurence; struct NgramCell * digrams[DIGRAMBRANCH]; int count; }; static struct dictEntry ** theDictionary; // all unique words indexed in this hash table static int N; static int filelength; static struct dictEntry *listOfUniqueWords ; static WORDTOKEN *wordArray; static char ** wordStartArray; static int * wordLengthArray; static int likelyMaxNumberOfWords,actualWordCount; static char * filebuf; int tokenize(); void printCodeWord(int ,FILE * ); void checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( struct NgramCell **,int,int,struct NgramCell *); void recordWordAtPosition(WORDTOKEN theWord, int thePosition ) { int hashPosition = theWord % likelyMaxNumberOfWords; int i; register struct dictEntry * newEntry, *anEntry; #ifndef NDEBUG printCodeWord(thePosition); #endif anEntry=theDictionary[hashPosition]; if((anEntry) == NULL){ #ifndef NDEBUG printf(" added to index\n"); #endif createNewRecord: newEntry = (struct dictEntry *) malloc(sizeof(struct dictEntry)); newEntry->overflow=theDictionary[hashPosition]; newEntry->count=1; newEntry->positionOfFirstOccurence = thePosition; for(i=0;idigrams[i] = NULL; theDictionary[hashPosition]=newEntry; newEntry->linkToAllOtherUniqueWords = listOfUniqueWords; listOfUniqueWords = newEntry; anEntry = newEntry; } { // first we need to check it is not just a clash in the hash table in which // case we will have to deal with it by overflow chaining. while ( (wordArray[anEntry->positionOfFirstOccurence]!=theWord)) { anEntry = anEntry->overflow; if(anEntry == NULL){ goto createNewRecord;} } assert(wordArray[anEntry->positionOfFirstOccurence]==theWord); anEntry->count++; // this is not the first occurence of the word in the file so we consider the // words that follow up to a max of N checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ(&(anEntry->digrams[wordArray[thePosition+1] %DIGRAMBRANCH]),thePosition,0,NULL); } } void newngram( struct NgramCell ** This,int Position,int J,struct NgramCell *mother){ struct NgramCell * pACell ; // this is the first time the suffix has had this added to it so we // create a new NgramCell to record the suffix pACell = (struct NgramCell * )malloc(sizeof(struct NgramCell )); pACell->positionOfFirstOccurenceOfStartOfNgram = Position; pACell->lowerWordsAtThisLevel=NULL; pACell->higherWordsAtThisLevel =NULL; pACell->pointerDownToNgramPlus1level=NULL; pACell->count= 1; #ifdef TRACEBACK int ok;struct NgramCell * back =mother; pACell->mother = mother; pACell->previousInstancesOfThisSeq=0; ok=1; while(ok && back != NULL) {// check if any ancestor is identical and increment count accordingly int i; for( i=0;i<=J;i++) if( wordArray[Position+i] != wordArray[back->positionOfFirstOccurenceOfStartOfNgram+i])ok=0; if (ok) pACell -> previousInstancesOfThisSeq ++; back= back -> mother; } #endif pACell->terminatingCell=1; *This = pACell; return; } void checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( struct NgramCell ** This,int Position,int J, struct NgramCell *motherNode) { //if(J>=N) return;// we are only interested in concordances up to this level if(J+Position >= actualWordCount) return; struct NgramCell * pACell = * This; register int thisword,existingword; if(pACell == NULL) { newngram(This, Position,J,motherNode); pACell = *This ; if(J>=N)return; pACell -> count =0;pACell ->terminatingCell=0; } if((thisword=wordArray[Position+J])!=(existingword=wordArray[pACell->positionOfFirstOccurenceOfStartOfNgram +J])) { // we will have to recurse down the list to find an occurence of this suffix if(thisword>existingword) { checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( &(pACell->lowerWordsAtThisLevel), Position, J,motherNode); } else { checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( &(pACell->higherWordsAtThisLevel), Position, J,motherNode); }; return; } assert(wordArray[Position+J]==wordArray[pACell->positionOfFirstOccurenceOfStartOfNgram +J]); // we have found a repetition of a n Ngram of length J now see if it is part of an Ngram of length J+1 pACell->count++; checkFollowOnWordsForRepetitionAfterThisPositionAtDeptJ( &(pACell->pointerDownToNgramPlus1level), Position, J+1,pACell); } void printCodeWord(int position,FILE*f) { char *s = wordStartArray[position]; fwrite(s, sizeof(char), wordLengthArray[position],f); fprintf(f," "); } int countOccurences(struct NgramCell *pCell) { int i=0; if (pCell==NULL) return 0; #ifdef TRACEBACK i=pCell->previousInstancesOfThisSeq; #endif return pCell->count+i; } int offset(struct NgramCell *pCell) { return pCell->positionOfFirstOccurenceOfStartOfNgram ; } void listOccurences(struct NgramCell *pCell ,FILE *f) { if (pCell==NULL) return ; if(pCell->terminatingCell)fprintf(f,"%d ",offset(pCell)); listOccurences(pCell->pointerDownToNgramPlus1level,f); listOccurences(pCell->lowerWordsAtThisLevel,f); listOccurences(pCell->higherWordsAtThisLevel,f); } void printThisNgram( struct NgramCell *pCell, int level,FILE *f ) { if(pCell==NULL)return; if (level>N) return; #ifdef TRACEBACK struct NgramCell *back; #endif int j=countOccurences(pCell);int i; if (jpositionOfFirstOccurenceOfStartOfNgram,f); fprintf(f,"\":(%d,[",j ); #ifdef TRACEBACK for(i=1,back=pCell->mother;i<=pCell->previousInstancesOfThisSeq;i++) { fprintf(f,"%d ",offset(back));back=back->mother; } #endif if (pCell->terminatingCell)fprintf(f,"%d ",offset(pCell)); listOccurences(pCell->pointerDownToNgramPlus1level ,f); fprintf(f,"])>\n"); } void printRepetitions( struct NgramCell *pCell, int level,FILE *f) { if(pCell != NULL) { printThisNgram(pCell,level ,f); if( pCell-> pointerDownToNgramPlus1level != NULL) { // this ngram was repeated at least once printRepetitions(pCell-> pointerDownToNgramPlus1level,level + 1,f); } printRepetitions(pCell -> lowerWordsAtThisLevel,level,f); printRepetitions(pCell -> higherWordsAtThisLevel,level,f); } } #define NUM_THREADS 2 #define PMASK 1 struct threadblock{int mask; FILE *f;}; int enableprint=0; void printResults( int mask,FILE *f) { struct dictEntry * wordList=listOfUniqueWords ; int wordcount=0; int i; while (wordList != NULL) { if((wordArray[wordList->positionOfFirstOccurence] & PMASK)==mask) { wordcount++; for(i=0;i digrams[i],1,f); } } wordList = wordList -> linkToAllOtherUniqueWords; } //printf("words %d unique words %d\n",actualWordCount,wordcount); } void * recordem(void * p){ int i; struct threadblock *t =p; int mask = t->mask;FILE *f=t->f; //printf("recordem(%d)\n",mask); for(i=0;i127){NextState[HALTED][c]=NextState[SKIPING][c]=NextState[APPENDING][c] = NextState[FIRSTCHAR][c] =NextState[FIRSTSKIP][c]=HALTED;} else if((c<='z' && c>='a')||(c<='Z' && c>='A')||c=='\''){ NextState[FIRSTSKIP][c]=NextState[SKIPING][c]=FIRSTCHAR; NextState[FIRSTCHAR][c]=NextState[APPENDING][c] = APPENDING; NextState[HALTED][c]=HALTED; }else { NextState[SKIPING][c]=NextState[FIRSTSKIP][c] = SKIPING; NextState[APPENDING][c]=NextState[FIRSTCHAR][c]= FIRSTSKIP; NextState[HALTED][c]=HALTED; } } }