A while ago my brother and I started playing with the IP to country code tables provided by ARIN and others.
The problem is this:
Given IP 123.4.5.6 and a list of ranges like this:
0.0.0.0 to 4.255.255.255 us 5.0.0.1 to 5.10.255.255 nl 5.15.0.0 to 20.1.4.128 de ... 255.0.0.0 to 255.255.255.255 xx
find the range, if any, that corresponds to the given IP.
If you sort the list, you can do a binary search and you can improve upon that by providing an index.
You can get a copy of the full list (compiled mid 2002) here [L1 ] that the code below needs.
This code illustrates, I think it does anyway, that the way you do something in C is likely not the way to do it in Tcl. In Tcl, two separate lists that hold the IP ranges and corresponding country codes would likely be much, much faster. As it is, the Tcl (8.4.2 on linux) code is about 430 times slower than the C implementation.
16May2003 MS: Please see further down for a modified implementation that is about 25 times slower than C.
13May2003 PS.
This is the code in C:
#include <stdio.h> #include <stdlib.h> #define REC_OFFSET 8 #define REC_SIZE 18 struct record_t { unsigned int low; unsigned int high; char country[3]; }; unsigned int IpToUInt(const char *ipStr) { unsigned int d1=0, d2=0, d3=0, d4=0; sscanf(ipStr, "%u.%u.%u.%u", &d1, &d2, &d3, &d4); return ( ((d1 & 0xFF) << 24) | \ ((d2 & 0xFF) << 16) | \ ((d3 & 0xFF) << 8) | \ (d4 & 0xFF) ); } unsigned int HexToUInt(const char *hexIp) { return (int)strtoul(hexIp, NULL, 16); } int GetRecord(struct record_t *r, int num, FILE *infile) { char data[9]; data[8]=0; // terminate data. //seek to the selected record: if ( fseek(infile, REC_OFFSET + num * REC_SIZE, SEEK_SET)==-1) return 0; //read&parse low IP if( !fread(&data, 8, 1, infile) ) { return 0; } r->low = HexToUInt(data); //read&parse high IP if( !fread(&data, 8, 1, infile) ) { return 0; } r->high = HexToUInt(data); //read country if( !fread(r->country, 2, 1, infile) ) { return 0; } r->country[2]=0; //terminate string return 1; } const char *lookup(unsigned int ip, int records, struct record_t *r, int *index, int *hindex) { int steps=0; int found=0; unsigned int myMin=0, myMax=0, avg; myMin=index[ip >> 24]; myMax=hindex[ip >> 24]; while( myMax > myMin ) { steps++; avg = (myMax + myMin) /2; if( r[avg].low <= ip && r[avg].high >= ip ) { found=1; break; } else if (r[avg].low <= ip) { myMin = avg; } else { myMax = avg; } if (steps > 32) { //Infiniteloop break; } } if (found) { return r[avg].country; } else { //check one above: if ( avg < records ) if( r[avg+1].low <= ip && r[avg+1].high >= ip ) return r[avg+1].country; //check one below: if (avg > 0) if( r[avg-1].low <= ip && r[avg-1].high >= ip ) return r[avg-1].country; return 0; } } int main (int argc, char *argv[]) { unsigned int ip; char *country; FILE *infile=0; int i; char data[9]; struct record_t *r; int index[256], hindex[256], indexptr; int records=0; if (argc < 2) { fprintf(stderr, "Usage: lookup <dotted-decimal-ip> <...>\n"); return 2; } if( (infile=fopen("opentarget.dat", "r"))==NULL ) { fprintf(stderr, "Could not open database\n"); return 2; } data[8]=0; // terminate data. if ( fseek(infile, 0, SEEK_SET)==-1) return 0; if( !fread(&data, 8, 1, infile) ) { fprintf(stderr, "File read error while initialising\n"); return 2; } records = HexToUInt(data); if ( (r = malloc(sizeof(struct record_t)*records)) == NULL) { fprintf(stderr, "Insufficient memory\n"); return 2; } for (i=0; i<256; i++) index[i]=hindex[i]=0; index[0]=0; indexptr=0; for (i=0; i < records; i++) { //read&parse low IP if( !fread(&data, 8, 1, infile) ) { fprintf(stderr, "Error reading database record %d\n", i); return 2; } r[i].low = HexToUInt(data); //read&parse high IP if( !fread(&data, 8, 1, infile) ) { fprintf(stderr, "Error reading database record %d\n", i); return 2; } r[i].high = HexToUInt(data); //read country if( !fread(r[i].country, 2, 1, infile) ) { fprintf(stderr, "Error reading database record %d\n", i); return 2; } r[i].country[2]=0; //terminate string //add to index? if (indexptr < (r[i].low >> 24)) { indexptr=r[i].low >> 24; index[indexptr]=i; } } fclose(infile); for (i=1; i<256; i++) { if(index[i]==0) index[i]=index[i-1]; } hindex[255]=records; for (i=254; i>=0; i--) { if (index[i+1]>index[i]) { hindex[i]=index[i+1]; } else { hindex[i]=hindex[i+1]; } } //for (i=0; i<256; i++) printf("%04x", index[i]); //for (i=0; i<256; i++) printf("%04x", hindex[i]); //return 0; for (i=1; i<argc; i++) { // uncomment the next loop to do a speed test: //int j; //for (j=0; j<1000000; j++) { ip=IpToUInt(argv[i]); country=lookup(ip, records, r, index, hindex); //} if ( country!=NULL ) { printf("%s %s\n", argv[i], country); } else { printf("%s NOMATCH\n", argv[i]); } } return 0; }
The same code in Tcl looks like this:
namespace eval ::opentarget { set max 0 set theArray(0) foo set dbLoaded 0 set theList [list] proc calcIp {ip} { set lIp [split $ip .] while {[llength $lIp] < 4} { lappend lIp 0 } set res [expr ((((([lindex $lIp 0].0 * 256.0) + [lindex $lIp 1].0) * 256.0) + [lindex $lIp 2].0) * 256.0) + [lindex $lIp 3].0] return [lindex [split $res .] 0] } proc calcIpS {ip} { #calc IP, but drop the LSB set lIp [split $ip .] while {[llength $lIp] < 4} { lappend lIp 0 } set res [expr (((( [lindex $lIp 0]) * 256) + [lindex $lIp 1]) * 256) + [lindex $lIp 2]] return $res } proc calcIpHex {hex} { return [format %u 0x$hex] } proc openFile {fileName} { regsub -all {\\} $fileName {/} fileName variable max set if [open $fileName r] set lenHex [read $if 8] set len [format %i 0x$lenHex] set max [expr $len - 1] return $if } proc readDatabase {fileName} { regsub -all {\\} $fileName {/} fileName set key -1 variable theArray variable theList variable max variable dbLoaded set if [open $fileName r] set max [expr 1 * 0x[read $if 8]] while {![eof $if] && [string length [set entry [read $if 18]]] == 18} { lappend line [string range $entry 0 7] lappend line [string range $entry 8 15] lappend line [string range $entry 16 17] incr key set theArray($key) $line unset line if { ![string equal [string range $entry 6 7] "00"] } { #puts "$key starts at [string range $entry 6 7]" } if { ![string equal [string range $entry 14 15] "ff"] } { #puts "$key ends at [string range $entry 14 15]" } lappend line [calcIpHex [string range $entry 0 5]] lappend line [calcIpHex [string range $entry 8 13]] lappend line [string range $entry 16 17] lappend theList $line unset line } set max $key close $if set dbLoaded 1 } proc lookup {args} { variable theArray variable dbLoaded if {[llength $args] == 1 && !$dbLoaded} { error "Database not loaded" } elseif {[llength $args] == 1 && $dbLoaded} { lookupArray [lindex $args 0] } else { lookupFile [lindex $args 0] [lindex $args 1] } } proc lookupArray {ip} { variable theArray variable max variable dbLoaded if {!$dbLoaded} {error "database not loaded"} set nip [calcIp $ip] set myMin 0 set myMax $max set steps 0 set avg 0 while {[expr $myMax - $myMin] > 0 && $steps < 32} { incr steps if {$avg == 1} { set avg 0 } else { set avg [expr $myMax - (($myMax - $myMin) /2)] } set current $theArray($avg) set low [calcIpHex [lindex $current 0]] set hi [calcIpHex [lindex $current 1]] if {[expr "$low.0 <= $nip.0"] && [expr "$hi.0 >= $nip.0"]} { return [lindex $current 2] } elseif {[expr "$low.0 <= $nip.0"]} { set myMin $avg } else { set myMax $avg } } return "" } proc lookupArrayS {ip} { variable theList variable max variable dbLoaded if {!$dbLoaded} {error "database not loaded"} set nip [calcIpS $ip] set myMin 0 set myMax $max set steps 0 set avg 0 while {[expr $myMax - $myMin] > 0 && $steps < 32} { incr steps if {$avg == 1} { set avg 0 } else { set avg [expr $myMax - (($myMax - $myMin) /2)] } set current [lindex $theList $avg] set low [lindex $current 0] set hi [lindex $current 1] if {[expr $low <= $nip] && [expr $hi >= $nip]} { return [lindex $current 2] } elseif {[expr $low <= $nip]} { set myMin $avg } else { set myMax $avg } } return "" } proc lookupFile {fileName ip} { variable max set if [openFile $fileName] set found 0 set nip [calcIp $ip] set myMin 0 set myMax $max set steps 0 while {[expr $myMax - $myMin] > 0 && $steps < 32} { incr steps if {$myMax == 1} { set avg 0 } else { set avg [expr $myMax - (($myMax - $myMin) /2)] } seek $if [expr 8 + ($avg * 18)] start set low [calcIpHex [read $if 8]] set hi [calcIpHex [read $if 8]] set country [read $if 2] if {[expr $low.0 <= $nip.0] && [expr $hi.0 >= $nip.0]} { set found 1 set return $country break } elseif {[expr $low.0 <= $nip.0]} { set myMin $avg } else { set myMax $avg } } close $if if {!$found} { return "" } else { return $return } } } opentarget::readDatabase opentarget.dat puts [time { set foo [::opentarget::lookupArray $args] } 1000]
RS has this version:
proc ip2s ip {eval format %02x%02x%02x%02x [split $ip .]} set fp [open [file dirname [info script]]/opentarget.dat] read $fp 8 ;# skip record count set data {} while 1 { set from [read $fp 8] set to [read $fp 8] set country [read $fp 2] if [eof $fp] break lappend data s$from s$to $country } close $fp foreach arg $argv { set ips s[ip2s $arg] foreach {from to country} $data { if {$from<=$ips && $to>=$ips} {puts $arg:$country; break} } }
which may not be optimally fast, but responds fast enough when called from command line. Most of all, it is extremely short (compared to the original), so easyly maintained.
ulis, 2004-12-18. I don't know how slow is this script but I trust it.
MS did some performance tuning on the original algorithm presented here, modifying the list-based lookupArrayS to avoid runtime parsing (lookupArrayS1) and by a slight change in the loop algorithm (lookupArrayS2).
This produced a 17x speedup over the original (lookupArray) - so that this algorithm is now about 25 times slower than C.
proc lookupArrayS1 {ip} { variable theList variable max variable dbLoaded if {!$dbLoaded} {error "database not loaded"} set nip [calcIpS $ip] set myMin 0 set myMax $max set steps 0 set avg 0 while {$myMax > $myMin && $steps < 32} { incr steps if {$avg == 1} { set avg 0 } else { set avg [expr {$myMax - (($myMax - $myMin) /2)}] } set current [lindex $theList $avg] set low [lindex $current 0] set hi [lindex $current 1] if {$low <= $nip && $hi >= $nip} { return [lindex $current 2] } elseif {$low <= $nip} { set myMin $avg } else { set myMax $avg } } } proc lookupArrayS2 {ip} { variable theList variable max variable dbLoaded if {!$dbLoaded} {error "database not loaded"} set nip [calcIpS $ip] set myMin 0 set myMax $max while {$myMin <= $myMax} { set avg [expr {($myMax + $myMin) /2}] set current [lindex $theList $avg] # $current is now a 3-list of the form # {low-value hi-value country-code} if {[lindex $current 0] > $nip} { set myMax [expr {$avg - 1}] } elseif {[lindex $current 1] < $nip} { set myMin [expr {$avg + 1}] } else { return [lindex $current 2] } } # no suitable range found: return "" } % time {opentarget::lookupArray 128.1.0.1} 1000 2788 microseconds per iteration % time {opentarget::lookupArrayS 128.1.0.1} 1000 2044 microseconds per iteration % time {opentarget::lookupArrayS1 128.1.0.1} 1000 199 microseconds per iteration % time {opentarget::lookupArrayS2 128.1.0.1} 1000 161 microseconds per iteration % % expr 430*161/2788. 24.831420373
14May2003 PS Cool. Great improvement. I see a 13 fold increase between lookupArrayS and S2. Comparing with lookupArray is not fair because ArrayS only does a partial lookup, masking the final/LSB byte in the IP address. Is S2 still guaranteed to return? That is why I added the steps counter (which should actually start at 15, because there are only about 17000 entries in the list).
Now on to writing a TIP for [lsearch -(lower|upper)bound] with implementation and the Tcl version should be very close to the C version.
15May2003 MS I modified S2 slightly with an additional optimisation (having an esthetic rather than performance impact). S2 is indeed guaranteed to return, as (a) myMin or myMax converge at every non-returning iteration, (b) S2 returns "" when they cross over. The original S (and S1) can leave both myMin and myMax unmodified (when they differ by one). Note that this improved algorithm can also be used in the C implementation.