CommonPrefix in random List

#!/bin/env tclsh

# GOAL  1. find common PREFIX in random but structured data
#       2. LONGER prefix is better than SHORTER prefix
#       3. ONLY interested in prefix with MIN 10% usage of input list

set namesL {
   MK_SRT                 MkTypeS§S§M§P              SQ3_VAL                   sqlite3_context          Fts5ExtensionApi        
   sqlite3_module         Sq3ScanStatE               Fts5PhraseIter§S          char§G                   MkBoolE§E               
   MkLogFileS§S§M§P       Sq3TextE                   Sq3IoCapEF§E              sqlite3_rtree_geometry§S sqlite3_pcache_methods§S
   char§P                 Sq3IndexConstraintEF       MK_STR                    short                    MkBufferS§S§M§P         
   sqlite3_pcache         SQ3_LITEN                  Sq3StatementS§S           Sq3DeSerializeE§E        MkBufferListS§S         
   Sq3SyncEF§E            Fts5Tokenizer              Sq3ChangesetE             Sq3StatementS            MkObjectS§S§P           
   short§G                MK_BINB§M§P                Sq3VtabE                  MkBufferS§S§P            MkObjectProtectS        
   SQ3_STMTN              MK_PTR                     Sq3TxnE                   Sq3DbConfigE§E           MK_OBJ                  
   sqlite3_pcache§S       MK_NAT_LIST                MK_ERREXTN                Sq3TextE§E               Sq3ConflictResolutionE§E
   Sq3TestCtrlE           Sq3PrepareEF               MkErrorS§S§P              Sq3LiteS§S§M§P           MK_BFL                  
   sqlite_uint64          Sq3ChangesetE§E            Sq3LiteS§S§P              Sq3ChangeSetConflictE§E  MK_LONG                 
   MkBufferS§S            MkObjectS                  MkExtensionS§S§M§P        double                   Sq3OpenEF               
   MkTypeE§E              Sq3BlobS                   sqlite3_int64             MK_TYPN                  fts5_api                
   fts5_tokenizer         sqlite3_pcache_methods2    sqlite3                   Sq3IndexConstraintEF§E   sqlite3_mutex_methods§S 
   sqlite3_value          sqlite3_api_routines§S     sqlite3_vfs               Sq3ShmLockE§E            Sq3LockE§E              
   sqlite3_index_info§S   Sq3TypeE§E                 MK_RT                     SQ3_FILE                 sqlite3_module§S        
   Sq3TestCtrlE§E         Sq3PrepareEF§E             MK_PTRB                   sqlite3_io_methods       MK_BIN                  
   Sq3ConfigE§E           sqlite3_pcache_methods2§S  sqlite3§S                 Sq3FcntlE§E              MkStringR               
   Sq3AuthActionE§E       sqlite3_backup             MK_LFLN                   Sq3ValueS§S§P            MK_MXP                  
   Sq3SessionObjConfig    Sq3SerializeE              sqlite3_backup§S          MK_ERREXT                MK_PTRN                 
   MK_LFL                 MK_EXTN                    SQ3_VALN                  Sq3DbConfigE             Fts5PhraseIter          
   sqlite3_str§S          Sq3FileS                   MK_BUFN                   MK_ATO                   int§G                   
   MK_CBP                 Sq3ErrorE§E                sqlite3_api_routines      MK_ERRN                  MkBufferListS§S§P       
   sqlite3_vfs§S          Sq3TraceEF                 Fts5Tokenizer§S           sqlite3_vtab_cursor§S    sqlite3_mem_methods§S   
   Sq3AuthActionE         MkBufferS                  MK_CCP                    MkRtExtS§S§M§P           sqlite_int64            
   MK_ULN                 MkErrorE                   sqlite3_blob§S            sqlite3_pcache_page§S    Sq3CheckpointE§E        
   sqlite3_context§S      Fts5ExtensionApi§S         fts5_tokenizer§S          MkExtensionS             MkExtensionCNR          
   int64_t                Sq3ConflictResolutionE     MkExtensionS§S§M          Sq3ValueS§S§M§P          Fts5Context§S           
   sqlite3_pcache_methods Sq3DbStatusE§E             MkErrorS                  MkExtensionS§S§P         MkObjectProtectS§S      
   Fts5Context            Sq3LimitE§E                sqlite3_stmt              MK_BOL                   sqlite3_vtab_cursor     
   sqlite3_mem_methods    MK_MNGN                    MK_NUM                    MK_OBJN                  MkExtensionS§S          
   bool                   Sq3BlobS§S§M§P             Sq3CheckpointE            MkObjectS§S§M§P          Sq3ShmLockE             
   Sq3LockE               MK_LLG                     MkNativeIsE               void§M§P                 sqlite3_vtab            
   MkTypeS§S§P            int                        sqlite3_blob              MK_TYP                   MkRuntimeS§S§P          
   sqlite3_filename       MK_ERR                     MkErrExtS§S§M§P           MK_TIME_T                Sq3ConfigE              
   MK_STRB§P              sqlite3_mutex§S            char§G§M§P                sqlite3_uint64           Sq3AccessE              
   fts5_api§S             sqlite3_mutex              sqlite3_snapshot§S        MK_NAT_OBJECT            MK_USB                  
   Sq3OpenEF§E            MkErrExtS§S§P              MkTimeoutE                MkLogFileS               Sq3DbStatusE            
   Sq3StatusE§E           MkBinaryR                  MkErrorS§S§M§P            MK_USI                   sqlite3_rtree_geometry  
   MK_STRB§M§P            sqlite3_rtree_query_info§S MK_SIG                    MK_SIZE                  SQ3_BLOBN               
   sqlite3_snapshot       MK_LSTB                    MK_DBG                    SQ3_LITE                 Sq3SyncEF               
   char§G§P               Sq3LiteS                   MkLogFileS§S              MK_DBL                   MK_USS                  
   Sq3LimitE              Sq3StatementS§S§P          MK_BUF                    Sq3AccessE§E             MK_FLT                  
   sqlite3_value§S        float                      Sq3BlobS§S§P              MK_USW                   Sq3MutexE               
   SQ3_FILENAME           sqlite3_stmt§S             MkExtensionCR             MK_INT                   MkRuntimeS§S§M§P        
   Sq3ValueS§S            MkBufferListS§S§M§P        sqlite3_vtab§S            Sq3ValueS                Sq3VtabE§E              
   MK_BUS                 sqlite3_mutex_methods      void§P                    MK_RTN                   MkBufferStreamS§S       
   char§M§P               uint64_t                   sqlite3_rtree_query_info  MK_RTEXT                 Sq3TraceEF§E            
   Sq3FileS§S§P           MK_PTRB§P                  Sq3ExtendetResultCodesE§E MkErrorS§S               MkTimeoutE§E            
   MK_BOOL                Sq3ScanStatE§E             sqlite3_io_methods§S      SQ3_FILEN                MK_EXP                  
   long§G§L               MK_PTRB§M§P                MkBufferStreamS§S§P       Sq3ErrorE                MkTypeE                 
}


proc findCommonPrefix {namesL} {
  set rangeL  {1 2 3 4 5 6 7 8}   ; # check from 2 to 8 prefix chars

  # 1. found usage of prefix
  array set prefixA [list]
  foreach nmeS $namesL {
    foreach wideI $rangeL {
      if {[string length $nmeS]-1 < $wideI} continue
      set pfxS  [string range $nmeS 0 $wideI]
      incr prefixA($pfxS)
    }
  }

  # 2. number of minimal usage per prefix (10%)
  set minNum    [expr {int([llength $namesL] * 0.1)}]

  # 3. setup input data
  set prefixL3  [list]
  foreach {pfxS pfxNum} [array get prefixA] {
    if {$pfxNum < $minNum} continue
    lappend prefixL3 $pfxS [string length $pfxS] $pfxNum
  }

  # 4. sort list to compare neighbour
  set prefixL3  [lsort -stride 3 -index 0 $prefixL3]

  # 5. start work
  set resultL   [list]
  # in order list with stride=3 go from left to right
  for {set i 0} {$i < [llength $prefixL3]} {incr i 3} {
    lassign [lrange $prefixL3 $i $i+2] iS iLen iNum
    # list of "to-be-deleted" items
    set tuBeDelL  [list]
    # for every "i" go to right and check for "common"
    for {set j [expr {$i+3}]} {$j < [llength $prefixL3]} {incr j 3} {
      lassign [lrange $prefixL3 $j $j+2] jS jLen jNum
      # if "i" is a prefix from "j" ?
      if {$iLen < $jLen && $iS eq [string range $jS 0 $iLen-1]} {
        if {$iNum > $jNum} {
          # "i" better than "j" → delete "j"
          lappend tuBeDelL $j
        } elseif {$iNum == $jNum} {
          # "j" better than "i" because "j" has more chars → delete "i"
            if {[llength $tuBeDelL]} {
              # dont forget to delte all "to-be-deleted-list" first
              set prefixL3 [lreplace $prefixL3 [lindex $tuBeDelL 0] [lindex $tuBeDelL end]+2]
            }
          # because "i" ist deleted the "for" loop have to repeat "i" position
          set tuBeDelL $i
          incr i -3
          break
        } else {
          # never reached? → "i" and "j" are different → "i" is also "best-in-sub-group"
          lappend resultL $iS $iNum
        }
      } else {
        lappend resultL $iS $iNum
        break ; # mew stuff
      }
    }
    # something to delete ?
    if {[llength $tuBeDelL]} {
      set prefixL3 [lreplace $prefixL3 [lindex $tuBeDelL 0] [lindex $tuBeDelL end]+2]
    }
  }
  # last "i" is always best
  lappend resultL $iS $iNum

  # kill duples
  set resultL [lsort -unique -stride 2 -index 0 $resultL]

  # order by large "pfxNun" usage first
  lsort -stride 2 -integer -index 1 -decreasing $resultL
}

# ---------------------------------------------------------

puts "\nresult: length namesL = [llength $namesL]\n"

# print result
set fmt {result: %-10s %s}
puts [format $fmt pfxS pfxNum]
puts [format $fmt ---------- -----]
foreach {pfxS pfxNum} [findCommonPrefix $namesL] {
  puts [format $fmt $pfxS $pfxNum]
}

Result

result: length namesL = 255

result: pfxS       pfxNum
result: ---------- -----
result: Sq3        68
result: MK_        55
result: sqlite3    50
result: Mk         44