#!/bin/env tclsh # GOAL 1. find common PREFIX in random but structured data # 2. LONGER prefix is better than SHORTER prefix # 3. ONLY interested in prefix with MIN 10% usage of input list set namesL { MK_SRT MkTypeS§S§M§P SQ3_VAL sqlite3_context Fts5ExtensionApi sqlite3_module Sq3ScanStatE Fts5PhraseIter§S char§G MkBoolE§E MkLogFileS§S§M§P Sq3TextE Sq3IoCapEF§E sqlite3_rtree_geometry§S sqlite3_pcache_methods§S char§P Sq3IndexConstraintEF MK_STR short MkBufferS§S§M§P sqlite3_pcache SQ3_LITEN Sq3StatementS§S Sq3DeSerializeE§E MkBufferListS§S Sq3SyncEF§E Fts5Tokenizer Sq3ChangesetE Sq3StatementS MkObjectS§S§P short§G MK_BINB§M§P Sq3VtabE MkBufferS§S§P MkObjectProtectS SQ3_STMTN MK_PTR Sq3TxnE Sq3DbConfigE§E MK_OBJ sqlite3_pcache§S MK_NAT_LIST MK_ERREXTN Sq3TextE§E Sq3ConflictResolutionE§E Sq3TestCtrlE Sq3PrepareEF MkErrorS§S§P Sq3LiteS§S§M§P MK_BFL sqlite_uint64 Sq3ChangesetE§E Sq3LiteS§S§P Sq3ChangeSetConflictE§E MK_LONG MkBufferS§S MkObjectS MkExtensionS§S§M§P double Sq3OpenEF MkTypeE§E Sq3BlobS sqlite3_int64 MK_TYPN fts5_api fts5_tokenizer sqlite3_pcache_methods2 sqlite3 Sq3IndexConstraintEF§E sqlite3_mutex_methods§S sqlite3_value sqlite3_api_routines§S sqlite3_vfs Sq3ShmLockE§E Sq3LockE§E sqlite3_index_info§S Sq3TypeE§E MK_RT SQ3_FILE sqlite3_module§S Sq3TestCtrlE§E Sq3PrepareEF§E MK_PTRB sqlite3_io_methods MK_BIN Sq3ConfigE§E sqlite3_pcache_methods2§S sqlite3§S Sq3FcntlE§E MkStringR Sq3AuthActionE§E sqlite3_backup MK_LFLN Sq3ValueS§S§P MK_MXP Sq3SessionObjConfig Sq3SerializeE sqlite3_backup§S MK_ERREXT MK_PTRN MK_LFL MK_EXTN SQ3_VALN Sq3DbConfigE Fts5PhraseIter sqlite3_str§S Sq3FileS MK_BUFN MK_ATO int§G MK_CBP Sq3ErrorE§E sqlite3_api_routines MK_ERRN MkBufferListS§S§P sqlite3_vfs§S Sq3TraceEF Fts5Tokenizer§S sqlite3_vtab_cursor§S sqlite3_mem_methods§S Sq3AuthActionE MkBufferS MK_CCP MkRtExtS§S§M§P sqlite_int64 MK_ULN MkErrorE sqlite3_blob§S sqlite3_pcache_page§S Sq3CheckpointE§E sqlite3_context§S Fts5ExtensionApi§S fts5_tokenizer§S MkExtensionS MkExtensionCNR int64_t Sq3ConflictResolutionE MkExtensionS§S§M Sq3ValueS§S§M§P Fts5Context§S sqlite3_pcache_methods Sq3DbStatusE§E MkErrorS MkExtensionS§S§P MkObjectProtectS§S Fts5Context Sq3LimitE§E sqlite3_stmt MK_BOL sqlite3_vtab_cursor sqlite3_mem_methods MK_MNGN MK_NUM MK_OBJN MkExtensionS§S bool Sq3BlobS§S§M§P Sq3CheckpointE MkObjectS§S§M§P Sq3ShmLockE Sq3LockE MK_LLG MkNativeIsE void§M§P sqlite3_vtab MkTypeS§S§P int sqlite3_blob MK_TYP MkRuntimeS§S§P sqlite3_filename MK_ERR MkErrExtS§S§M§P MK_TIME_T Sq3ConfigE MK_STRB§P sqlite3_mutex§S char§G§M§P sqlite3_uint64 Sq3AccessE fts5_api§S sqlite3_mutex sqlite3_snapshot§S MK_NAT_OBJECT MK_USB Sq3OpenEF§E MkErrExtS§S§P MkTimeoutE MkLogFileS Sq3DbStatusE Sq3StatusE§E MkBinaryR MkErrorS§S§M§P MK_USI sqlite3_rtree_geometry MK_STRB§M§P sqlite3_rtree_query_info§S MK_SIG MK_SIZE SQ3_BLOBN sqlite3_snapshot MK_LSTB MK_DBG SQ3_LITE Sq3SyncEF char§G§P Sq3LiteS MkLogFileS§S MK_DBL MK_USS Sq3LimitE Sq3StatementS§S§P MK_BUF Sq3AccessE§E MK_FLT sqlite3_value§S float Sq3BlobS§S§P MK_USW Sq3MutexE SQ3_FILENAME sqlite3_stmt§S MkExtensionCR MK_INT MkRuntimeS§S§M§P Sq3ValueS§S MkBufferListS§S§M§P sqlite3_vtab§S Sq3ValueS Sq3VtabE§E MK_BUS sqlite3_mutex_methods void§P MK_RTN MkBufferStreamS§S char§M§P uint64_t sqlite3_rtree_query_info MK_RTEXT Sq3TraceEF§E Sq3FileS§S§P MK_PTRB§P Sq3ExtendetResultCodesE§E MkErrorS§S MkTimeoutE§E MK_BOOL Sq3ScanStatE§E sqlite3_io_methods§S SQ3_FILEN MK_EXP long§G§L MK_PTRB§M§P MkBufferStreamS§S§P Sq3ErrorE MkTypeE } proc findCommonPrefix {namesL} { set rangeL {1 2 3 4 5 6 7 8} ; # check from 2 to 8 prefix chars # 1. found usage of prefix array set prefixA [list] foreach nmeS $namesL { foreach wideI $rangeL { if {[string length $nmeS]-1 < $wideI} continue set pfxS [string range $nmeS 0 $wideI] incr prefixA($pfxS) } } # 2. number of minimal usage per prefix (10%) set minNum [expr {int([llength $namesL] * 0.1)}] # 3. setup input data set prefixL3 [list] foreach {pfxS pfxNum} [array get prefixA] { if {$pfxNum < $minNum} continue lappend prefixL3 $pfxS [string length $pfxS] $pfxNum } # 4. sort list to compare neighbour set prefixL3 [lsort -stride 3 -index 0 $prefixL3] # 5. start work set resultL [list] # in order list with stride=3 go from left to right for {set i 0} {$i < [llength $prefixL3]} {incr i 3} { lassign [lrange $prefixL3 $i $i+2] iS iLen iNum # list of "to-be-deleted" items set tuBeDelL [list] # for every "i" go to right and check for "common" for {set j [expr {$i+3}]} {$j < [llength $prefixL3]} {incr j 3} { lassign [lrange $prefixL3 $j $j+2] jS jLen jNum # if "i" is a prefix from "j" ? if {$iLen < $jLen && $iS eq [string range $jS 0 $iLen-1]} { if {$iNum > $jNum} { # "i" better than "j" → delete "j" lappend tuBeDelL $j } elseif {$iNum == $jNum} { # "j" better than "i" because "j" has more chars → delete "i" if {[llength $tuBeDelL]} { # dont forget to delte all "to-be-deleted-list" first set prefixL3 [lreplace $prefixL3 [lindex $tuBeDelL 0] [lindex $tuBeDelL end]+2] } # because "i" ist deleted the "for" loop have to repeat "i" position set tuBeDelL $i incr i -3 break } else { # never reached? → "i" and "j" are different → "i" is also "best-in-sub-group" lappend resultL $iS $iNum } } else { lappend resultL $iS $iNum break ; # mew stuff } } # something to delete ? if {[llength $tuBeDelL]} { set prefixL3 [lreplace $prefixL3 [lindex $tuBeDelL 0] [lindex $tuBeDelL end]+2] } } # last "i" is always best lappend resultL $iS $iNum # kill duples set resultL [lsort -unique -stride 2 -index 0 $resultL] # order by large "pfxNun" usage first lsort -stride 2 -integer -index 1 -decreasing $resultL } # --------------------------------------------------------- puts "\nresult: length namesL = [llength $namesL]\n" # print result set fmt {result: %-10s %s} puts [format $fmt pfxS pfxNum] puts [format $fmt ---------- -----] foreach {pfxS pfxNum} [findCommonPrefix $namesL] { puts [format $fmt $pfxS $pfxNum] }
Result
result: length namesL = 255 result: pfxS pfxNum result: ---------- ----- result: Sq3 68 result: MK_ 55 result: sqlite3 50 result: Mk 44