/* * rbsl.cpp - Red-Black Sorted List * * Source rbsl.cpp is available at * http://home.earthlink.net/~glenn_scheper/rbsl.cpp * * Windows executable rbsl.exe is available at * http://home.earthlink.net/~glenn_scheper/rbsl.exe * * Copyright ( C ) 2006 Glenn Scheper. This program is free software; * you can redistribute it and/or modify it under the terms of the GNU * General Public License as published by the Free Software Foundation; * either version 2 of the License, or any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. You should have * received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Contact author Glenn Scheper by glenn_scheper + at + earthlink.net * Download .EXE and .CPP at http://home.earthlink.net/~glenn_scheper/ * GNU General Public License: http://www.fsf.org/copyleft/gpl.html * * Sort "top-down 2-3-4 tree" implementation using the "RED-BLACK" framework * as instructed in Robert Sedgewick's _Algorithms_, 1983, Chapters 14 - 15. * * A single-file command line demonstration program * compilable in Open Watcom 1.5, or in Visual C++ 6. * * After porting my old pointer-based sort/count routine, * http://home.earthlink.net/~glenn_scheper/redblk.cpp * based on the book _Algorithms_, by Robert Sedgewick; * I had to dig out his valuable book to fix uncertainties. * * Other online red-black texts require pointers to parents: * http://www.nist.gov/dads/HTML/redblack.html * The fact that Sedgewick does not have parent pointers in nodes * make us need to keep a parentage. I guess a few are sufficient. * * Indeed, his point is that we keep splitting 4-nodes while going down. * A 2-node above a 4-node is tranformed into a 3-node above two 2-nodes; * A 3-node above a 4-node is tranformed into a 4-node above two 2-nodes; * * Split function is coded twice inline in Insert function. * Split function is expressed using Insert's variables. * Split restarts x high enough to not lose the search. * * Various illegal red-black relationships will require a rotate. * * There are many possible cases to rotate, but Sedgewick says that: * during rotate, re-use value to rediscover relevant son, grandson. * * Due to rotate the colors of son and grandson become switched. * Rotate does not do the color change; Rotate's caller does it. * * Finally, the act of inserting a new node is as if it were the middle * of a 4-node, having two ( empty ) red children: Immediately split it. * * Sedgewick goes on with many facts and relationships. * There are never two consecutive red links going down tree. * * Version 1.0 2006-08-12 * * main * | * |->GiveUsageMessage * | |..fprintf * | * |->RBSL:: * | | * | |->RBSL::RBSL * | | |..fprintf malloc exit memset * | | * | |->RBSL::~RBSL * | | |..fprintf free * | | * | |->RBSL::FindOrInsert * | | |..fprintf strlen realloc exit * | | :..StrcmpTrialValueInTailTo Rotate malloc memcpy * | | * | |->RBSL::TraverseTree * | | |..fprintf Consumer * | | * | |->RBSL::ListDiagnosticDump * | | |..fprintf * | | * | |->RBSL::StrcmpTrialValueInTailTo * | | |..fprintf strncmp strcmp * | | * | |->RBSL::Rotate * | | |..StrcmpTrialValueInTailTo fprintf * | * |..exit freopen fprintf setmode fgets isalpha * :..InputStuff.FindOrInsert * :..InputStuff.ListDiagnosticDump * :..InputStuff.TraverseTree * :..pOutputStuff->ListDiagnosticDump * :..pOutputStuff->TraverseTree * * RbsiConsumerCopyPrefixCount * |..sprintf pOutputStuff->FindOrInsert * * RbsiConsumerPrintAfterCount * |..printf * * RbsiConsumerPrintPrefixCount * |..printf * * RbsiConsumerPrintVerbatim * |..printf * * * * ******* Also note this other, more recent, web page: ******** * URL: http://home.earthlink.net/~glenn_scheper/rbsl2.cpp * | * Here is a non-compilable concatenation of some parts of WordsEx * freeware: http://home.earthlink.net/~glenn_scheper/WordsEx.exe * showing how I adapted the red-black sorted list algorithms for: * - object orientation, * - wide character set, * - multiple threads. * | * In particular, note that the routine to traverse the sorted list * was changed to a routine to return a copied vector of list items, * so that I could get in and out from under this critical section, * for the needful layering of critical sections to avoid conflicts. * * */ char RBSL_VersionAndDate [] = "RBSL version 1.0, Aug 12, 2006."; #include // for FILE, fprintf, etc. #include // for exit, rand, malloc, etc. #include // for memcpy, strcmp, etc. #include // for time #include // for isalpha, etc #include // for setmode #include // for O_BINARY #define MAX_INPUT_LINE 5000 void GiveUsageMessage( ) { fprintf( stdout, " RBSL sorts and counts whole text lines appearing in input, to stdout.\n" " RBSL demonstrates the red-black balanced sort in a top-down 2-3-4 tree,\n" " as instructed in Robert Sedgewick's _Algorithms_, 1983, Chapters 14 - 15.\n" " \n" " Usage: RBSL [flags] in.txt > out.txt;\n" " Or pipe ... | RBSL [flags] > out.txt;\n" " Flag -q ... sort output by Quantity.\n" " Flag -n ... prefix Number with item.\n" " Flag -r ... Reverse order of output.\n" " Flags -t/-w sort input Tokens/Words.\n" " Flag -d ... add program Diagnostics.\n" " \n" " %s Copyright ( C ) 2006 Glenn Scheper.\n" //...VersionAndDate here " RBSL comes with ABSOLUTELY NO WARRANTY; See source code for details.\n" " This is free software, and you are welcome to redistribute it under\n" " certain conditions; See GNU comment block in source code for details.\n" " \n" " Source available at http://home.earthlink.net/~glenn_scheper/rbsl.cpp\n" " Windows executable at http://home.earthlink.net/~glenn_scheper/rbsl.exe\n" " email: glenn_scheper + at + earthlink.net\n" , RBSL_VersionAndDate ); return; } // avoid char sign extension problems typedef unsigned char BYTE; // diagnostic debug file pointer, when non-null FILE * fpRBSL = NULL; // RBSL uses array [indices] instead of pointers. // The first two slots of RBSI array are special. // Slots #2 and up are real symbol-table entries. // No valid name lookup will have indices 0 or 1. // All unused nodes -> Tail. Before searching, fill Tail with key. // TAIL is set into all non-existant leaf links, to stop searches. // Tail has 0 links, so can never be "red" under some 3 or 4-node. // Tail -> Left ( Lower ) and Right ( Higher ) should never be needed. #define TAIL 0 // Head key is lower than any valid string. Right points to root. // List[HEAD] is a real node, so it can be as father to the root. #define HEAD 1 typedef struct _RBSI // Red-Black Sorted Item { BYTE * szKeyExtension; // if non-NULL, a malloc to rest of key string. BYTE KeySeven [7]; // First seven characters of key string. BYTE Red; // Two red siblings makes their father a 4-node. unsigned int Lower; // index to a child RBSI with a lower valued key. unsigned int Higher; // index to a child RBSI with a higher valued key. // Other members useful to some greater program: unsigned int ItemCountTotal; // and maybe pointers to other structures... // For example, where-appeared references list, // probability valuations, frequency in language, // part of speech and ontology info (esp. WordNet). } RBSI; // RBSI consumer callback routine formal parameter declaration typedef typedef void ( * RBSI_CONSUMER ) ( RBSI * pRBSI ); class RBSL // Red-Black Sorted List { public: RBSL( ); ~RBSL( ); unsigned int FindOrInsert( BYTE * InsertKey ); void TraverseTree( RBSI_CONSUMER Consumer, int ReverseOutputOrder ); void ListDiagnosticDump( ); RBSI * List; // dangerously public. I hand users indices into it. // potentially private: int StrcmpTrialValueInTailTo( unsigned int y ); unsigned int Rotate( unsigned int y ); unsigned int ListMallocCount; unsigned int ListFilledCount; BYTE * MallocTailKeyExtensionPtr; unsigned int MallocTailKeyExtensionSize; }; RBSL::RBSL( ) { if( fpRBSL ) fprintf( fpRBSL, "RBSL constructor.\r\n" ); ListMallocCount = 3; // head, tail, room for one more List = ( RBSI * ) malloc( sizeof( RBSI ) * ListMallocCount ); MallocTailKeyExtensionSize = 10; // Growable. No artificial ceilings here. MallocTailKeyExtensionPtr = ( BYTE * ) malloc( MallocTailKeyExtensionSize ); if( List == NULL || MallocTailKeyExtensionPtr == NULL ) { // Windows NEVER runs out of memory. But just in case... if( fpRBSL ) fprintf( fpRBSL, "RBSL constructor malloc failure.\r\n" ); exit( 1 ); } ListFilledCount = 2; // including tail ( #0 ) and head ( #1 ) List[TAIL].szKeyExtension = MallocTailKeyExtensionPtr; List[TAIL].szKeyExtension[0] = NULL; memset( List[TAIL].KeySeven, 0, sizeof( List[TAIL].KeySeven ) ); List[TAIL].Red = 0; List[TAIL].Lower = TAIL; // original algorithm might look ahead for red. List[TAIL].Higher = TAIL; // original algorithm might look ahead for red. // Init other members useful to some greater program: List[TAIL].ItemCountTotal = 0; List[HEAD].szKeyExtension = NULL; memset( List[HEAD].KeySeven, 0, sizeof( List[HEAD].KeySeven ) ); List[HEAD].Red = 0; List[HEAD].Lower = TAIL; // although low side never used. List[HEAD].Higher = TAIL; // at the edge of the tree now. // Init other members useful to some greater program: List[HEAD].ItemCountTotal = 0; } RBSL::~RBSL( ) { if( fpRBSL ) fprintf( fpRBSL, "RBSL destructor.\r\n" ); if( List != NULL ) { // I must free any mallocs hanging off the listed structures. // This one is sometimes on, sometimes off. Put it on now. List[TAIL].szKeyExtension = MallocTailKeyExtensionPtr; unsigned int i = 0; for( ;; ) { if( i == ListFilledCount ) break; if( List[i].szKeyExtension != NULL ) free( List[i].szKeyExtension ); // including MallocTailKeyExtensionPtr // Free any other members useful to some greater program... i ++; } free( List ); // so many reallocs, so little time... } } unsigned int RBSL::FindOrInsert( BYTE * InsertKey ) { // This is a 2-3-4 binary tree insert, using a red-black balancing concept. // Two red sibling items means their father is a 4-node, needing splitting. // Sedgewick's routine assumes a prior search failure; lacks equality exit. if( fpRBSL ) fprintf( fpRBSL, "\r\n\r\nFindOrInsert passed value: <%s>.\r\n", InsertKey ); // Copy into tail. I could bite the bullet, and ask the string length, // but I'm a two-register-char-pointer kind of guy, ever since 8080 C. BYTE * from = InsertKey; BYTE * into = List[TAIL].KeySeven; int i = 0; for( ;; ) { if( ( *into = *from ) != NULL ) from++; into ++; if( ++ i == sizeof( List[TAIL].KeySeven ) ) break; } int LengthOfExtension = 0; // I'll be glad I kept this length. if( *from == NULL ) { // We're done. But in case exactly 7, append NULL. // TAIL always has its extension. I'd append NULL, // but leave NULL pointer for simplest comparison. List[TAIL].szKeyExtension = NULL; } else { // More chars to copy into TAIL extension. // Restore non-NULL pointer for extension. into = List[TAIL].szKeyExtension = MallocTailKeyExtensionPtr; int Remains = MallocTailKeyExtensionSize; for( ;; ) { if( ( *into ++ = *from ++ ) == NULL ) break; if( -- Remains == 0 ) { // we need to expand this memory before continuing to copy int Lacking = strlen( ( char * ) from ) + 1; // 1 for NULL int TotalNeed = MallocTailKeyExtensionSize + Lacking; int NewMallocSize = TotalNeed + TotalNeed * 11 / 10; // 110% BYTE * NewMallocPtr = ( BYTE * ) realloc( MallocTailKeyExtensionPtr, NewMallocSize ); if( NewMallocPtr == NULL ) // I should use NEW semantics { if( fpRBSL ) fprintf( fpRBSL, "Extending TailKey malloc failure.\r\n" ); exit( 1 ); } MallocTailKeyExtensionSize = NewMallocSize; MallocTailKeyExtensionPtr = NewMallocPtr; // Whither to continue copy? int Advanced = into - List[TAIL].szKeyExtension; List[TAIL].szKeyExtension = MallocTailKeyExtensionPtr; into = List[TAIL].szKeyExtension + Advanced; } } LengthOfExtension = into - List[TAIL].szKeyExtension; // including NULL } unsigned int greatgrandfather = HEAD; unsigned int grandfather = HEAD; unsigned int father = HEAD; unsigned int x = HEAD; for( ;; ) { // Each search iteration proceeds down the tree one generation. // However, a split may set the iterator x up some generations. // Original algorithm would have appended an equal key to the right // of all existing equal keys. I changed method so equal keys will // be discovered in this outermost loop, and the inner subroutines // will never encounter a test key that is equal to any tree entry. int PlusMinusOrZero = StrcmpTrialValueInTailTo( x ); if( PlusMinusOrZero == 0 ) { // This is the equality case that will re-use an existing node. // Or, match might be to self in TAIL, in which case, add node. if( fpRBSL ) fprintf( fpRBSL, "FindOrInsert found trial value <%-7.7s> equal to x <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[x].KeySeven, x ); break; } // That equality test must occur before these loop re-assignments: // Otherwise match to self in TAIL has father=TAIL; All fall down. greatgrandfather = grandfather; grandfather = father; father = x; if( PlusMinusOrZero < 0 ) { if( fpRBSL ) fprintf( fpRBSL, "FindOrInsert found trial value <%-7.7s> lower than x <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[x].KeySeven, x ); x = List[x].Lower; if( fpRBSL ) fprintf( fpRBSL, "Advancing x to x's low side child: New x is <%-7.7s> at #%4d.\r\n", ( char* ) List[x].KeySeven, x ); } else { if( fpRBSL ) fprintf( fpRBSL, "FindOrInsert found trial value <%-7.7s> higher than x <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[x].KeySeven, x ); x = List[x].Higher; if( fpRBSL ) fprintf( fpRBSL, "Advancing x to x's high side child: New x is <%-7.7s> at #%4d.\r\n", ( char* ) List[x].KeySeven, x ); } // At this point, we have chosen/traversed a new x, // and the old value of x has become "father" of x. if( fpRBSL ) fprintf( fpRBSL, "Now considering x,f,g,gg = %4d of %4d of %4d of %4d.\r\n", x, father, grandfather, greatgrandfather ); // If both children of x are red, then x is a 4-node to be split. // Sedgewick calls this test the only "cost" to the inner loop. if( List[List[x].Lower].Red && List[List[x].Higher].Red ) { if( fpRBSL ) fprintf( fpRBSL, "Both children of x are red. So x at #%4d is a four-node to be split.\r\n", x ); // The book _Algorithms_ said the Split routine // should access the local variables of Insert, // meaning its swaps should be reflected here. // Therefore in-line Split into Insert 2 places. // ============================================ // Here starts copy #1 of Split, included inline. // ============================================ // x = Split ( TestKey, greatgrandfather, grandfather, father, x ); // function split( v: integer, gg,g,f,x:link ): link; // begin // x^.red=true; x^.l^.red:=false; x^.r^.red:=false; // if f^.red then // begin // g^.red:=true; // if( v( v lower than grandfather <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[grandfather].KeySeven, grandfather ); } else { LowerThanGrandFather = 0; if( fpRBSL ) fprintf( fpRBSL, "Split found trial value <%-7.7s> higher than grandfather <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[grandfather].KeySeven, grandfather ); } if( StrcmpTrialValueInTailTo( father ) <= 0 ) { if( fpRBSL ) fprintf( fpRBSL, "Split found trial value <%-7.7s> lower than father <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[father].KeySeven, father ); LowerThanFather = 1; } else { if( fpRBSL ) fprintf( fpRBSL, "Split found trial value <%-7.7s> higher than father <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[father].KeySeven, father ); LowerThanFather = 0; } // per book, "if ( v < g->Key ) <> ( v < f->Key ), Rotate gf. if( LowerThanGrandFather != LowerThanFather ) { if( fpRBSL ) fprintf( fpRBSL, "Per Sedgewick, if ( v < g->Key ) <> ( v < f->Key ), Rotate gf.\r\n" ); x = Rotate( grandfather ); } // and, in red father, always Rotate ggf: x = Rotate( greatgrandfather ); List[x].Red = 0; } // ============================================ // Here ends copy #1 of Split, included inline. // ============================================ } List[List[HEAD].Higher].Red = 0; // keep removing any redness at top } // As expressed in Sedgewick, Insert assumes a prior search failure. // Allow for a search success during Insert, whenever x wasn't TAIL. if( x != TAIL ) { if( fpRBSL ) fprintf( fpRBSL, "Trial value <%-7.7s> was found at existing node #%4d.\r\n", ( char* ) List[TAIL].KeySeven, x ); } else { x = ListFilledCount; // next available index/ID number if( fpRBSL ) fprintf( fpRBSL, "Trial value <%-7.7s> was not found until TAIL. Making new node #%4d.\r\n", ( char* ) List[TAIL].KeySeven, x ); int HoldOntoTheNewX = x; // Otherwise, next split causes wrong x return if( ListFilledCount == ListMallocCount ) { // we need to expand this list before adding new member. int NewMallocCount = ( ListMallocCount + 3 ) * 11 / 10; // 110% RBSI * NewMallocPtr = ( RBSI * ) realloc( List, sizeof( RBSI ) * NewMallocCount ); if( NewMallocPtr == NULL ) // I should use NEW semantics { if( fpRBSL ) fprintf( fpRBSL, "Adding new item malloc failure.\r\n" ); exit( 1 ); } ListMallocCount = NewMallocCount; List = NewMallocPtr; } ListFilledCount ++; // Initialize members of this new x item: // I'm glad I kept this length. if( LengthOfExtension == NULL ) { List[x].szKeyExtension = NULL; } else { List[x].szKeyExtension = ( BYTE * ) malloc( LengthOfExtension ); // including NULL // I hate to exit gracelessly. // But it's insane not to test every malloc. if( List[x].szKeyExtension == NULL ) { if( fpRBSL ) fprintf( fpRBSL, "Malloc new key extension failure.\r\n" ); exit( 1 ); } memcpy( List[x].szKeyExtension, List[TAIL].szKeyExtension, LengthOfExtension ); // including NULL } memcpy( List[x].KeySeven, List[TAIL].KeySeven, sizeof( List[TAIL].KeySeven ) ); // New edges of tree points back to Tail sentinel. List[x].Higher = TAIL; List[x].Lower = TAIL; List[x].Red = 0; // this node is still a singleton, not twain // Init other members useful to some greater program: List[x].ItemCountTotal = 0; // Split above might have changed x's father's identity. // So compare to current father before attaching to him. // I assume that means father's slot must be empty here? if( StrcmpTrialValueInTailTo( father ) < 0 ) { if( fpRBSL ) fprintf( fpRBSL, "Brand new node <%-7.7s> is lower than its father <%-7.7s> at #%4d, so change father.Lower from %d to %d.\r\n", ( char* ) List[x].KeySeven, ( char* ) List[father].KeySeven, father, List[father].Lower, x ); List[father].Lower = x; } else { if( fpRBSL ) fprintf( fpRBSL, "Brand new node <%-7.7s> is higher than its father <%-7.7s> at #%4d, so change father.Higher from %d to %d.\r\n", ( char* ) List[x].KeySeven, ( char* ) List[father].KeySeven, father, List[father].Higher, x ); List[father].Higher = x; } // I'm just following the recipe. It says to split after adding x. // x = Split ( TestKey, greatgrandfather, grandfather, father, x ); { // ============================================ // Here starts copy #2 of Split, included inline. // ============================================ // x = Split ( TestKey, greatgrandfather, grandfather, father, x ); // function split( v: integer, gg,g,f,x:link ): link; // begin // x^.red=true; x^.l^.red:=false; x^.r^.red:=false; // if f^.red then // begin // g^.red:=true; // if( v( v lower than grandfather <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[grandfather].KeySeven, grandfather ); } else { LowerThanGrandFather = 0; if( fpRBSL ) fprintf( fpRBSL, "Split found trial value <%-7.7s> higher than grandfather <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[grandfather].KeySeven, grandfather ); } if( StrcmpTrialValueInTailTo( father ) <= 0 ) { if( fpRBSL ) fprintf( fpRBSL, "Split found trial value <%-7.7s> lower than father <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[father].KeySeven, father ); LowerThanFather = 1; } else { if( fpRBSL ) fprintf( fpRBSL, "Split found trial value <%-7.7s> higher than father <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[father].KeySeven, father ); LowerThanFather = 0; } // per book, "if ( v < g->Key ) <> ( v < f->Key ), Rotate gf. if( LowerThanGrandFather != LowerThanFather ) { if( fpRBSL ) fprintf( fpRBSL, "Per Sedgewick, if ( v < g->Key ) <> ( v < f->Key ), Rotate gf.\r\n" ); x = Rotate( grandfather ); } // and, in red father, always Rotate ggf: x = Rotate( greatgrandfather ); List[x].Red = 0; } // ============================================ // Here ends copy #2 of Split, included inline. // ============================================ } x = HoldOntoTheNewX; // Otherwise, that split causes wrong x return } return x; } void RBSL::TraverseTree( RBSI_CONSUMER Consumer, int ReverseOutputOrder ) { // For generality, I am passed a consumer that accepts one RBSI. // I think I can make all the needed declarations w/o craziness. // Make an iterative version of the recursive tree print. unsigned int IndexStack[ 8 * sizeof( int ) ]; unsigned int IndexStackCount = 0; unsigned int CurrentIndex = List[HEAD].Higher; // root of tree int JustPopped = 0; if( CurrentIndex == TAIL ) { // This unusual exit path is seen when the list is empty. return; } for( ;; ) { // process any lower/left link // Did I say left? Generalize: if( ! JustPopped ) // prevent re-re-re-re-re-re-re-repush... { for(;;) { int left = ReverseOutputOrder ? List[CurrentIndex].Higher : List[CurrentIndex].Lower ; if( left == TAIL ) // stop marching left break; IndexStack[ IndexStackCount ++ ] = CurrentIndex; if( fpRBSL ) fprintf( stdout, "Going left, from %3d to %3d.\r\n", CurrentIndex, left ); CurrentIndex = left; } } JustPopped = 0; // disarm // No more to the left? Then do current node. { if( fpRBSL ) fprintf( stdout, "Doing current, %3d.\r\n", CurrentIndex ); // Just pass the whole RBSI item to a passed Consumer function. Consumer( List + CurrentIndex ); } // Process any higher/right link int right = ReverseOutputOrder ? List[CurrentIndex].Lower : List[CurrentIndex].Higher ; if( right != TAIL ) { if( fpRBSL ) fprintf( stdout, "Going right, from %3d to %3d.\r\n", CurrentIndex, right ); CurrentIndex = right; continue; } // Having done any left, the current, and any right, then pop. if( IndexStackCount == 0 ) { // But, if none to pop, loop is done. if( fpRBSL ) fprintf( stdout, "Traverse loop exits.\r\n" ); break; } CurrentIndex = IndexStack[ -- IndexStackCount ]; if( fpRBSL ) fprintf( stdout, "Popping, to %3d.\r\n", CurrentIndex ); // Having popped, prevent another left push out of current node. JustPopped = 1; } } void RBSL::ListDiagnosticDump( ) { // Just a diagnostic. if( fpRBSL ) fprintf( fpRBSL, "\r\n" ); if( fpRBSL ) fprintf( fpRBSL, "ListMallocCount = %d.\r\n", ListMallocCount ); if( fpRBSL ) fprintf( fpRBSL, "ListFilledCount = %d.\r\n", ListFilledCount ); unsigned int i = 0; for( ;; ) { if( i == ListFilledCount ) break; if( fpRBSL ) fprintf( fpRBSL, "#%d = <%-7.7s> <%s>, %s; lower=%d. higher=%d. count=%d.\r\n", i, List[i].KeySeven, List[i].szKeyExtension == NULL ? "NULL" : ( char * ) List[i].szKeyExtension, List[i].Red ? "red" : "black", List[i].Lower, List[i].Higher, List[i].ItemCountTotal ); i ++; } if( fpRBSL ) fprintf( fpRBSL, "\r\n" ); } int RBSL::StrcmpTrialValueInTailTo( unsigned int y ) { // It appears to me that the v=value of original algorithm // is always the one started with, and is copied into TAIL. // So Initialized List[TAIL].Key... is an implied argument. // Every test herein is as if asking of List[ item # y ], // return strcmp( List[TAIL].Key..., List[y].Key.... ) // I plan to keep first 7 letters of key in the RBSI, // and extend any long words using a separate malloc. if( fpRBSL ) fprintf( fpRBSL, "Strcmp trying trial value <%-7.7s> against Key <%-7.7s> at #%4d.\r\n", ( char* ) List[TAIL].KeySeven, ( char* ) List[y].KeySeven, y ); int PlusMinusOrZero = strncmp( ( char* ) List[TAIL].KeySeven, ( char* ) List[y].KeySeven, 7 ); if( PlusMinusOrZero != 0 ) return PlusMinusOrZero; // found a difference in the first seven chars BYTE * pt = List[TAIL].szKeyExtension; BYTE * py = List[y].szKeyExtension; if( pt == NULL ) { if( py == NULL ) { return 0; // trial/tail value is equal to y value } else { // y value being longer, is higher. return -1; // trial/tail value is lower than y value } } else { if( py == NULL ) { // trial/tail value being longer, is higher. return 1; // trial/tail value is higher than y value } else { if( fpRBSL ) fprintf( fpRBSL, "Strcmp trying extension value <%s> against Key <%s> at #%4d.\r\n", ( char* ) pt, ( char* ) py, y ); return strcmp( ( char* ) pt, ( char* ) py ); // find out... } } } unsigned int RBSL::Rotate( unsigned int y ) { // Rotate balances the 2-3-4 tree during the Insert operation. // First we test y, to pick a son of interest given trial value; // Then we test that son of y, to pick the grandson of interest; // Then we swap the son and grandson positions under y, unmoved. unsigned int son; unsigned int grandson; int TestedLower; if ( StrcmpTrialValueInTailTo( y ) < 0 ) { son = List[y].Lower; TestedLower = 1; } else { son = List[y].Higher; TestedLower = 0; } // Here the son and grandson swap their descendent links. // Here the old grandson receives as a child the old son. if ( StrcmpTrialValueInTailTo( son ) < 0 ) { grandson = List[son].Lower; if( fpRBSL ) fprintf( fpRBSL, "ROTATING( gs,s,y = %4d, %4d, %4d ).\r\n", grandson, son, y ); List[son].Lower = List[grandson].Higher; List[grandson].Higher = son; } else { grandson = List[son].Higher; if( fpRBSL ) fprintf( fpRBSL, "ROTATING( gs,s,y = %4d, %4d, %4d ).\r\n", grandson, son, y ); List[son].Higher = List[grandson].Lower; List[grandson].Lower = son; } // Here the father, y, receives as a child the old grandson. // Reuse the "if v < y->key" from above. That hasn't changed. if( TestedLower ) { List[y].Lower = grandson; } else { List[y].Higher = grandson; } return grandson; } void RbsiConsumerPrintVerbatim( RBSI * pRBSI ) { // Print item string ( the whole key ) to output line. printf( "%.7s%s\r\n", pRBSI->KeySeven, pRBSI->szKeyExtension ? ( char * ) pRBSI->szKeyExtension : "" ); } void RbsiConsumerPrintAfterCount( RBSI * pRBSI ) { // Skip over ten chars ( "%9d " ) to print item string to output line. printf( "%s\r\n", pRBSI->szKeyExtension + 3 ); // 10 - 7 = 3 } void RbsiConsumerPrintPrefixCount( RBSI * pRBSI ) { // Prefix ten chars: "%9d ", then add item string to output line. printf( "%9d %.7s%s\r\n", pRBSI->ItemCountTotal, pRBSI->KeySeven, pRBSI->szKeyExtension ? ( char * ) pRBSI->szKeyExtension : "" ); } // I seem to need to have this global beforehand, // so the next consumer can know it to add to it. RBSL * pOutputStuff; void RbsiConsumerCopyPrefixCount( RBSI * pRBSI ) { // The input RBSI comes from another ( inputing ) instance of RBSL. // Prefix ten chars: "%9d ", then add the passed RBSI item text. // Use the new formatted string to add a new item to a different // ( outputing ) RBSL class, that instance in which I am a member. // I tried to make this routine a member of the RBSL class. // And then it must be declared static to take its address. // That made acceptable syntax for me to pass this routine. // But when a static, it apparently knows not the instance. // Therefore my FindOrInsert call would need instance,dot,. // This is not on the main path for my end task for the RBSL, // so I don't care if I make an artificial ceiling on length: BYTE Reformatted [MAX_INPUT_LINE + 10 + 1]; sprintf( ( char * ) Reformatted, "%9d %.7s%s", pRBSI->ItemCountTotal, pRBSI->KeySeven, pRBSI->szKeyExtension ? ( char * ) pRBSI->szKeyExtension : "" ); pOutputStuff -> FindOrInsert( Reformatted ); } int main( int argc, char **argv ) { int ReorderByQuantity = 0; int ReverseOutputOrder = 0; int PrefixCountNumbers = 0; int SortInputWords = 0; int SortInputTokens = 0; while( argc > 1 && argv[1][0] == '-' ) { switch( argv[1][1] | ' ' ) { case 'r': ReverseOutputOrder = 1; break; case 'n': PrefixCountNumbers = 1; break; case 'q': ReorderByQuantity = 1; break; case 't': SortInputTokens = 1; break; case 'w': SortInputWords = 1; break; case 'd': fpRBSL = stdout; break; default: GiveUsageMessage( ); exit( 1 ); } argc --; argv ++; } if( argc > 2 ) { GiveUsageMessage( ); exit( 1 ); } if( argc > 1 ) { if( freopen( argv[1], "rb", stdin ) == ( FILE* )0 ) { fprintf( stderr, "RBSL: Cannot open input file <%s>.\r\n", argv[1] ); return 1; } } // Let's do it. setmode( 1, O_BINARY ); RBSL InputStuff; static BYTE InputLine [MAX_INPUT_LINE + 1]; // loop reads input lines, adds them to list. for( ;; ) { BYTE * scan = InputLine; if( fgets( ( char * ) InputLine, MAX_INPUT_LINE, stdin ) == ( char* ) 0 ) { break; } // rid unprintables, binary while( *scan != '\0' && *scan != '\r' && *scan != '\n' ) { if( *scan > '~' || *scan < ' ' ) *scan = ' '; scan++; } // eliminate CR LF *scan = '\0'; if( SortInputWords ) { scan = InputLine; for(;;) { while( ! isalpha( *scan ) && *scan != NULL ) scan ++; // determinate: stops on null if( *scan == NULL ) break; BYTE * Atop = scan; while( isalpha ( *scan ) ) // determinate: stops on null { *scan |= ' '; // to lowercase scan ++; } BYTE c = *scan; *scan = NULL; unsigned int index = InputStuff.FindOrInsert( Atop ); InputStuff.List[index].ItemCountTotal ++; *scan = c; if( *scan == NULL ) break; } } else if( SortInputTokens ) { scan = InputLine; for(;;) { while( *scan == ' ' ) scan ++; // determinate: stops on null if( *scan == NULL ) break; BYTE * Atop = scan; while( *scan > ' ' ) scan ++; // determinate: stops on null BYTE c = *scan; *scan = NULL; unsigned int index = InputStuff.FindOrInsert( Atop ); InputStuff.List[index].ItemCountTotal ++; *scan = c; if( *scan == NULL ) break; } } else { unsigned int index = InputStuff.FindOrInsert( InputLine ); InputStuff.List[index].ItemCountTotal ++; } // continue reading input lines. } if( fpRBSL ) InputStuff.ListDiagnosticDump( ); if( ReorderByQuantity ) { // Init global RBSL pointer that is known to the consumer function: pOutputStuff = new RBSL; InputStuff.TraverseTree( RbsiConsumerCopyPrefixCount, ReverseOutputOrder ); if( fpRBSL ) pOutputStuff -> ListDiagnosticDump( ); if( PrefixCountNumbers ) pOutputStuff -> TraverseTree( RbsiConsumerPrintVerbatim, ReverseOutputOrder ); else pOutputStuff -> TraverseTree( RbsiConsumerPrintAfterCount, ReverseOutputOrder ); delete pOutputStuff; pOutputStuff = NULL; } else { if( PrefixCountNumbers ) InputStuff.TraverseTree( RbsiConsumerPrintPrefixCount, ReverseOutputOrder ); else InputStuff.TraverseTree( RbsiConsumerPrintVerbatim, ReverseOutputOrder ); } return 0; }