A grep-like utility

Summary

Arjen Markus (18 February 2003) I wanted a good, concise script that shows what Tcl is good at. That is, a script that performs tasks that are difficult or awkward in system programming languages, or that require lots of code.

The script below uses the following techniques:

  • Glob-style string matching
  • Regular expressions
  • Interaction with the file system
  • Graphical user interface in just a handful of lines of code
  • Reading from files without having to worry about the length of strings etc.

What it does is this:

  • Look for files matching the given pattern
  • Read each line and see if it matches the textual pattern
  • If so, display the line and the (first) matching part in the main window

It could be enhanced with lots of extra options, made to manoeuvre through the directory tree, and so on. But to get people at least somewhat familiar with the techniques, this script (170 lines including comments) should be adequate - I hope.

Program 1 - agrep


 # agrep.tcl --
 #    Script to emulate the UNIX grep command with a small GUI
 #

 # createWindow --
 #    Create the main window
 #
 # Arguments:
 #    None
 # Result:
 #    None
 # Side effects:
 #    Controls added to main window
 #
 proc createWindow {} {
    global filemask
    global pattern
    global ignore_case

    #
    # Menubar (simple)
    #
    frame .menubar -relief raised -borderwidth 1
    pack  .menubar -side top -fill x

    menubutton .menubar.file        -text File   -menu .menubar.file.menu
    menu       .menubar.file.menu   -tearoff false
    .menubar.file.menu add command -label Exit -command exit
    pack       .menubar.file -side left

    #
    # Fill in fields
    #
    frame       .f1
    label       .f1.empty       -text " "
    label       .f1.mask_label  -text "Files:" -justify left
    label       .f1.patt_label  -text "Regular expression:" -justify left
    entry       .f1.filemask    -textvariable filemask
    entry       .f1.pattern     -textvariable pattern
    checkbutton .f1.ignore_case -variable ignore_case -text "Ignore case"
    button      .f1.search      -command searchFiles -text "Search"

    grid .f1.empty      x             x
    grid .f1.mask_label .f1.filemask  .f1.search       -sticky w
    grid .f1.patt_label .f1.pattern   .f1.ignore_case  -sticky w

    pack .f1 -side top -fill x

    #
    # Result window
    #
    frame .f2
    text  .f2.text -font "Courier 10" \
       -yscrollcommand {.f2.y set} \
       -xscrollcommand {.f2.x set}
    scrollbar .f2.x -command {.f2.text xview} -orient horizontal
    scrollbar .f2.y -command {.f2.text yview}

    grid .f2.text .f2.y -sticky ns
    grid .f2.x    x     -sticky we

    pack .f2 -side top

    #
    # Just for the fun of it: define the styles for the "matched",
    # "error" and "fn" tags
    #
    .f2.text tag configure "matched" -underline 1 -background yellow
    .f2.text tag configure "fn"      -underline 1 -background lightblue
    .f2.text tag configure "error"   -background red
 }

 # searchFiles --
 #    Search for files in the current directory that match the given
 #    mask
 #
 # Arguments:
 #    None
 # Result:
 #    None
 # Side effects:
 #    Calls "searchPattern" to fill the result window
 #
 proc searchFiles {} {
    global filemask
    global pattern
    global ignore_case

    #
    # Clear the result window, then get a list of files
    #
    .f2.text delete 1.0 end

    if { $filemask == "" } {
       set filemask "*"
    }

    foreach file [glob $filemask] {
       if { [file isdirectory $file] } {
          continue ;# Might become a recursive descent later :)
       } else {
          searchPattern $file $pattern $ignore_case
       }
    }
 }

 # searchPattern --
 #    Search for lines containing the given pattern in a file
 #
 # Arguments:
 #    filename      Name of the file to be searched
 #    pattern       Given regular expression
 #    ignore_case   Ignore the case or not
 # Result:
 #    None
 # Side effects:
 #    Fills the result window
 #
 proc searchPattern {filename pattern ignore_case} {

    if { [ catch {
       set infile [open $filename "r"]

       .f2.text insert end "$filename:\n" fn

       while { [gets $infile line] >= 0 } {

          if { $ignore_case } {
             set match [regexp -nocase -indices -- $pattern $line indices]
          } else {
             set match [regexp -indices -- $pattern $line indices]
          }
          if { $match } {
             set  first [lindex $indices 0]
             set  last  [lindex $indices 1]
             .f2.text insert end [string range $line 0 [expr {$first-1}]]
             .f2.text insert end [string range $line $first $last] "matched"
             .f2.text insert end [string range $line [expr {$last+1}] end]
             .f2.text insert end "\n"
          }
       }

       close $infile

       } msg ] } {
         .f2.text insert end "$msg\n"
    }
    .f2.text insert end "\n"
 }

 # main --
 #    Main code to get it all going
 #
 global filemask
 global pattern
 global ignore_case

 set filemask    "*"
 set pattern     {}
 set ignore_case 1

 createWindow

Program 2

TV I'm not sure it is superfluous, but bwise for a long time had this one packed with it:


 proc grep { {a} {fs {*}} } {
   set o {}
   foreach n [lsort -incr -dict [glob $fs]] {
      set f [open $n r]
      set c 0
      set new 1
      while {[eof $f] == 0} {
         set l [gets $f]
         incr c
         if {[string first $a $l] > -1} {
            if {$new == 1} {set new 0; append o "*** $n:" \n}
            append o "$c:$l" \n
         }
      }
      close $f
   }
   return $o
 }
 # Test with:
 catch {console show}
 puts "Result:\n[grep "require" "*.tcl"]"

The return value is formatted to read well in a shell or console, but can easily be formatted for machine processing too. Efficiency was fine on older PCs. It errs on subdirectories when they match the file pattern (a small guard is sketched below). I just came up with it, but it is small.
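
The subdirectory issue could be addressed with a small, untested helper (the name globFiles is made up here) that keeps only plain files from the glob result:

 # Untested sketch: return only the plain files matching a glob pattern,
 # so that [open] inside the loop above never sees a directory.
 proc globFiles {fs} {
    set files {}
    foreach n [lsort -incr -dict [glob -nocomplain $fs]] {
       if {[file isfile $n]} {
          lappend files $n
       }
    }
    return $files
 }

In [grep] one would then iterate over [globFiles $fs] instead of [glob $fs].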


This application would be great if it could scan large files (> 20 MB) at a reasonable speed. Any ideas on how to achieve that?

AM I have never tried it on "large" files. What is the performance? Is it simply that there are a lot of hits (so that the display gets filled with a lot of lines)? Are your regular expressions complicated? (Hm, perhaps a small experiment ...)

I ran it on a directory with a dozen files. One of them is 23 MB and another 123 MB; the others are below 10 kB. The "grep-like utility" freezes. After 5 minutes I give up and kill it. A search with pure grep takes less than 10 seconds. Then I removed the 123 MB file; the search then took 55 seconds, which is still slow. There were few matches and a very simple "letters-only" regex. Speed has never been Tcl's strongest suit, or maybe this particular application's code is not efficient.

AM Experiment:

  • File of 11 MB, 164214 lines
  • Simple pattern "time"
  • 2826 lines containing that word
  • 30 seconds to read the file and display the result
  • 14 seconds to read the file and not display the result
  • 5 seconds to just read the file
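
Such timings can be taken with the [time] command; a hypothetical one-shot run (with the GUI from Program 1 already created, and bigfile.txt standing in for the test file):

 # one timed call of searchPattern; [time] reports microseconds per iteration
 puts [time {searchPattern bigfile.txt "time" 0} 1]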

What can we do about the performance?


schlenk:

  • don't call .f2.text insert end $something so often; coalesce the pieces into a single insert:
 set text [list [string range $line 0 [expr {$first-1}]] {} \
     [string range $line $first $last] "matched" \
     "[string range $line [expr {$last+1}] end]\n" {} ]
 eval [linsert $text 0 .f2.text insert end]
  • fconfigure the channel with a large buffer (see the sketch after this list)
  • I'm not sure whether running regexp -line over blocks of lines, instead of single lines, speeds things up:
     regexp -all -line -indices ...
     foreach {first last} $indices {break}
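
For the buffer suggestion, a minimal untested tweak is to configure the channel right after opening it in searchPattern (the -buffersize value here is just a guess):

 set infile [open $filename "r"]
 fconfigure $infile -buffering full -buffersize 1000000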

MS wrote something up, but did not get around to testing it. Consider it an "idea" rather than "code".

NOTE: if there are several matches in a single line, the original would highlight the first occurrence; this one highlights the last one instead. Should it be corrected to highlight them all?
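
For the "highlight them all" question, here is a rough, untested sketch using regexp -all -inline -indices (the proc name insertLineAllMatches is made up; it assumes the .f2.text widget and "matched" tag from Program 1, and a pattern without capturing groups):

 # Untested sketch: highlight every occurrence of the pattern in one line.
 # If the pattern contained capturing groups, their index pairs would be
 # interleaved in the result of [regexp -all -inline -indices].
 proc insertLineAllMatches {line pattern ignore_case} {
    set cmd [list regexp -all -inline -indices]
    if {$ignore_case} { lappend cmd -nocase }
    lappend cmd -- $pattern $line
    set prev 0
    foreach pair [eval $cmd] {
       foreach {first last} $pair break
       .f2.text insert end [string range $line $prev [expr {$first-1}]]
       .f2.text insert end [string range $line $first $last] "matched"
       set prev [expr {$last+1}]
    }
    .f2.text insert end [string range $line $prev end]
    .f2.text insert end "\n"
 }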

AM I did the experiment: with the original script I get some 13 seconds and with Miguel's version I get the same -- no obvious improvement. But: this is with Tcl 8.3.

Program 3


 # ATTENTION: requires inserting the following line in
 # the calling proc [searchFiles], before the loop that
 # iterates over all files:
 #      set pattern (.*)($pattern)(.*)
 # so that regexp will store the start-match-end 
 # parts of each line. 
 # The pattern above will highlight the last occurrence of
 # the requested pattern in a line; in order to get the
 # first, replace with
 #      set pattern (.*?)($pattern)(.*)$
 # This should be done in the
 # calling proc so that the compiled regexp need not be 
 # recompiled for each file.
 #
 # - NOT TESTED - 
 #
 # Improvements proposed here:
 #   1. make sure the input is buffered
 #   2. slurp in the file in larger chunks
 #   3. let [regexp] do the substring extraction
 #   4. coalesce text insertion (from above)
 
 proc searchPattern {filename pattern ignore_case} {
 
     set slurpSize 2000
     if { [ catch {
         set infile [open $filename "r"]
         fconfigure $infile -buffering full
         
         .f2.text insert end "$filename:\n" fn
         
         set chunk {}
         while 1 {
             append chunk [read $infile $slurpSize]
             if {![string length $chunk]} {
                 break
             }
             set lines [split $chunk "\n"]
             if {[eof $infile] && [string length [lindex $lines end]]} {
                 # at end of file, treat a final line without a newline as complete
                 lappend lines {}
             }
             set chunk [lindex $lines end]
            set oldTail {}
            set res [list .f2.text insert end]
             
             foreach line [lrange $lines 0 end-1] {
                #
                 # Use -inline instead of match variables, to ensure
                # fastest access to these variables: [regexp] does not
                # need to access the variables at all, this proc accesses 
                # them by index (no lookup).
                #
                # An alternative in Tcl8.4+, slightly slower though, 
                 # is to ensure that the match vars are seen as local vars
                # by the compiler (I am actually not sure if this is really
                # needed, should dive into the compiler for [regexp]). To do
                # that, if you use the match vars {-> start match tail}, you
                # could insert at the top (outside the loop)
                 #     foreach {-> start match tail} {} break
                # In this case, [regexp] still looks up the variables by
                # name but this proc body accesses the variables by index. 
                # (NOTE: the previous sentence was edited, it was wrong in its 
                # first version)
                #

                 if { $ignore_case } {
                     set matches [regexp -inline -nocase -- $pattern $line]
                 } else {
                     set matches [regexp -inline -- $pattern $line]
                 }
                 if {[llength $matches]} {
                     foreach {-> start match tail} $matches break
                       lappend res $oldTail$start {} $match "matched"
                       set oldTail $tail\n
                 }
            }
            if {[llength $res]} {
                eval [lappend res $oldTail {}]
            }
         }
         
         close $infile
         
     } msg ] } {
         .f2.text insert end "$msg\n"
     }
     .f2.text insert end "\n"
 }

AM (9 May 2005) Another thought: could it be that, as used above, [regexp] re-compiles the pattern on each invocation? Complicated patterns could then take a lot of time to compile. It would then be more efficient to change the procedure's body so that [regexp] sees a constant pattern ... I have not tested this idea yet.
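
A rough, untested sketch of that idea (the names makeMatcher and lineMatch are made up; Sarnold's Program 5 below builds its [matches] proc in the same spirit): generate a helper proc whose body carries the pattern as a literal word, so [regexp] always sees the same constant.

 proc makeMatcher {pattern ignore_case} {
    set opts "-indices -inline --"
    if {$ignore_case} { set opts "-nocase $opts" }
    # [list] quotes the pattern so that the generated body needs no further
    # substitution; the pattern ends up as a literal word in that body
    proc lineMatch line "regexp $opts [list $pattern] \$line"
 }

Call [makeMatcher $pattern $ignore_case] once before the file loop; [lineMatch $line] then returns an empty list when there is no match, or a list of index pairs (whole match first) when there is one.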


Ross Cartlidge (19 October 2005) egrep is always going to be faster than any Tcl/Perl/Python solution, so use it as a filter and only process the matched lines. Use this version of searchPattern:

Program 4


 proc searchPattern {filename pattern ignore_case} {

    if { [ catch {
       close [open $filename]
       .f2.text insert end "$filename:\n" fn

       set matches {}
       catch {
          if {$ignore_case} {
             set matches [exec egrep -i $pattern $filename]
          } else {
             set matches [exec egrep $pattern $filename]
          }
       }

       foreach line [split $matches "\n"] {
          if {$ignore_case} {
             set match [regexp -nocase -indices -- $pattern $line indices]
          } else {
             set match [regexp -indices -- $pattern $line indices]
          }
          if { $match } {
             set  first [lindex $indices 0]
             set  last  [lindex $indices 1]
             .f2.text insert end [string range $line 0 [expr {$first-1}]]
             .f2.text insert end [string range $line $first $last] "matched"
             .f2.text insert end [string range $line [expr {$last+1}] end]
             .f2.text insert end "\n"
          }
       }

    } msg ] } {
       .f2.text insert end "$msg\n"
    }
    .f2.text insert end "\n"
 }

Sarnold asks why not call string match or string first on each line before applying a regular expression?

Here is an attempt to make it faster (the regexps are constants, since they are baked into generated procs):

Program 5

 proc searchPattern {filename pattern ignore_case} {
    catch {rename matches ""}
    catch {rename globmatches ""}
    set pattern2 "*${pattern}*"
    set pattern  "(.*)(${pattern})(.*)"
    if {$ignore_case} {
            set body {regexp -inline -nocase --} 
    } else {
        set body {regexp -inline --} 
    }
    proc matches line [string map {%LINE% $line} [linsert $body end $pattern %LINE%]]
    if {$ignore_case} {
        set body {string match -nocase} 
    } else {
        set body {string match} 
    }
    proc globmatches line [string map {%LINE% $line} [linsert $body end $pattern2 %LINE%]]

    set slurpSize 2000
    if { [ catch {
            set infile [open $filename "r"]
            fconfigure $infile -buffering full
            
            .f2.text insert end "$filename:\n" fn
            
            set chunk {}
            while {![eof $infile]} {
                append chunk [read $infile $slurpSize]
                if {![string length $chunk]} {
                    break
                }
                set lines [split $chunk "\n"]
                if {[eof $infile] && [string length [lindex $lines end]]} {
                    # at end of file, treat a final line without a newline as complete
                    lappend lines {}
                }
                set chunk [lindex $lines end]
                
                foreach line [lrange $lines 0 end-1] {
                    # This prefilter checks $line with glob-style matching (via string match);
                    # you might drop it if you find that it restricts which patterns can be used
                    if {[globmatches $line]} {
                        if {[llength [set matches [matches $line]]]} {
                            foreach {-> start matched tail} $matches break
                            .f2.text insert end $start
                            .f2.text insert end $matched "matched"
                            .f2.text insert end $tail
                            .f2.text insert end "\n"
                        }
                    }
                }
            }
            close $infile
        } msg ] } {
        .f2.text insert end "$msg\n"
    }
    .f2.text insert end "\n"
 }
