text_replacer.tcl

Summary

rpremuz (2009-07-30)

text_replacer.tcl replaces character strings in the specified plain text files according to the given mappings.

Code

#!/bin/sh
# the next line restarts using tclsh \
exec tclsh "$0" "$@"

set usageMsg "text_replacer.tcl ver. 1.1
This Tcl script replaces character strings in the specified plain text files
according to the given mappings.
Usage: text_replacer.tcl ?options? mappingsFile ?inputFile...?
Try 'text_replacer.tcl -h' for more help."

set helpMsg "text_replacer.tcl ver. 1.1
This Tcl script replaces character strings in the specified plain text files
according to the given mappings.
Usage: text_replacer.tcl ?options? mappingsFile ?inputFile...?

mappingsFile
  Path of the text file that defines the string replacements by pairs of
  consecutive lines that have the following format:
  o:<old_string>
  n:<new_string>
  Each exact match of character string <old_string>, which must not be empty,
  will be replaced by the character string <new_string>.
  A line is ignored if it contains only whitespaces or begins with a hash (#).
  The script sorts the mappings in decreasing order (largest items first) based
  on old strings.

inputFile
  Path of the text file in which strings are to be replaced. If no files are
  specified, stdin is filtered to stdout.

Options:
-encoding <encoding>
  Specifies the encoding of the text in input, mappings and output. The
  'encoding names' Tcl command gives the accepted encodings. The default
  encoding is the platform- and locale-dependent system encoding. (A note for
  MS Windows command shell: if stdin or stdout corresponds to a terminal, its
  default encoding is unicode. Otherwise it is the system encoding.)

-translation <translation>
  Specifies the translation mode for end-of-line characters in input, mappings
  and output. The accepted values are:
    lf    The EOL is a single newline (linefeed) character. This is typically
          used on Unix-like platforms. 
    crlf  The EOL is a carriage return character followed by a linefeed
          character. This is typically used on MS DOS/Windows platforms.
    auto  This is the default mode. On input EOL may be lf or crlf. On output
          EOL is platform specific (lf for Unix, crlf for Windows).

-t        Prints the execution time of the main procedure to the stderr.

-g        Perform globbing on the input file names. This may be useful if the
          command shell does not provide globbing by itself. (A note for MS
          Windows command shell: Since the backslash character has a special
          meaning in glob patterns, paths must be specified using / as the
          directory separator, e.g. a local path: C:/foo/bar/*.txt and a UNC
          path: //server/share/*.txt)

-h        Prints this help to stdout.
--        Specifies the end of options.

The script first processes the mappings file to get the mappings. Then for each
specified input file it reads its lines and writes them to a temporary output
file making the replacements defined by the mappings. If the input file is a
symbolic link, the file that the link points to is opened.

If any real changes of input text have been made during the processing, the
input file is renamed to inputfile.old and the temporary output file is
renamed to inputfile. Otherwise, the temporary output file is deleted.

If an error occurs during the reading or writing, an error message is written
to stderr, the input file is not changed and the output file is deleted.

The script prints the debug messages to stderr if the DEBUG environment
variable is set to 1.

Exit code:
  0 - No errors during the processing of input files (or stdin).
  1 - An error occurred during the processing of input files (or stdin).
"

#-------------------------------------------------------------------------------
# The script was successfully tested on the following systems:
# Debian GNU/Linux 7      with Tcl 8.5.11 and Tcllib 1.14
#                         (packages tcl8.5 and tcllib)
# Debian GNU/Linux 8      with Tcl 8.6.2  and Tcllib 1.16
#                         (packages tcl8.6 and tcllib)
# Microsoft Windows 7/8.1 with ActiveState ActiveTcl 8.6.4.1
#                         (includes Tcllib 1.12)
#-------------------------------------------------------------------------------
# Copyright 2009-2016 Robert Premuz <rpr.nospam(at)gmail.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# The GNU General Public License Version 3 can be found at
# <http://www.gnu.org/licenses/>.
#-------------------------------------------------------------------------------
# Version history and author(s):
#
# Version 1.0 2009-01-08 by Robert Premuz <rpr.nospam(at)gmail.com>
# Initial release.
#
# Version 1.1 2016-01-28 by Robert Premuz <rpr.nospam(at)gmail.com>
# Option -g added.
################################################################################

package require Tcl 8.5
package require fileutil ;# ::fileutil::fullnormalize, ::fileutil::tempfile
package require cmdline ;# ::cmdline::getopt


################################################################################
# proc_tracer
# The procedure for tracing procedures.
#
proc proc_tracer {name command op} {
  puts stderr "$name >> $command"
}


################################################################################
# var_tracer
# The procedure for tracing variables.
#
proc var_tracer {varname args} {
  upvar 1 $varname var

  puts stderr ">>> $varname updated to {$var}"
}


################################################################################
# replace_strings
# This procedure reads the input file, makes the replacements defined by the
# mappings and writes the result to the output file.
#
# Arguments:
#   inCh  - ID of the input channel.
#   outCh - ID of the output channel.
#   errVar - The name of the variable, in the context of the calling procedure,
#            for storing an error message.
#   mappingsVar - The name of the variable, in the context of the calling
#                 procedure, that contains the list of string mappings.
#
# Returns:
#   noChange - The output contains no real changes of input text.
#   changed  - The output contains some real changes of input text.
#   inputError  - An error occurred while reading the input file.
#   outputError - An error occurred while writing the output file.
#
proc replace_strings {inCh outCh mappingsVar errVar} {
  upvar 1 $mappingsVar mappings
  upvar 1 $errVar err
  global env

  # If the DEBUG environment variable is set to "1", trace the variables.
  if {[info exists env(DEBUG)] && ($env(DEBUG) eq "1")} {
    foreach var {returnCode eof inLine inLineLen err outLine} {
      trace add variable $var write "var_tracer $var"
    }
  }

  # Read lines of the input file and process them.
  for {set returnCode noChange; set eof 0} {! $eof} {} {
    if {[catch { set inLineLen [gets $inCh inLine] } err]} {
      return inputError
    } elseif {$inLineLen == -1} {
      set eof 1
    } else {
      # Replace old strings with the new strings.
      set outLine [string map $mappings $inLine]

      # Check if there is a change after the replacement.
      if {$inLine ne $outLine} {
        set returnCode changed
      }

      # Write the output line.
      if {[catch { puts $outCh $outLine } err]} {
        return outputError
      }
    }
  }
  return $returnCode
}


################################################################################
# main
# This is the main procedure that opens and closes files and prints the error
# messages. It calls the replace_strings procedure for each input file.
#
# Returns:
#   0 - Procedure finished without errors.
#   1 - An error occurred. The error messages are written to stderr.
#
proc main {} {
  global usageMsg helpMsg argc argv0 argv env showExecutionTime

  # If the DEBUG environment variable is set to "1", trace the variables.
  if {[info exists env(DEBUG)] && ($env(DEBUG) eq "1")} {
    foreach var {argv0 argv argc returnCode tmpFilePrefix oldFileExt
      cmdlineOptions origArgv showExecutionTime printHelp globFileNames
      textEncoding eolTranslation option value result mapFilePath lineN mappings
      inLineLen inLine err oldStr newStr inputFiles nInputFiles i inFilePath
      renamedInFilePath outFilePath
    } {
      trace add variable $var write "var_tracer $var"
    }
  }

  set returnCode 0 ;# Exit code: 0 for success, 1 for errors.
  set tmpFilePrefix "tr_" ;# Prefix for temporary output file.
  set oldFileExt "old" ;# Extension for renaming the input file.
  set argv0 [file nativename [file normalize $argv0]] ;# The script path.

  # Process the command line options using ::cmdline::getopt.
  set cmdlineOptions "t h g encoding.arg translation.arg"
  set origArgv $argv
  set printHelp 0
  set showExecutionTime 0
  set globFileNames 0
  set textEncoding ""
  set eolTranslation ""
  while {[set result [::cmdline::getopt argv $cmdlineOptions option value]]} {
    if {$result == -1} { # Illegal option; value contains error message.
      puts stderr \
        "$argv0: $value.\nThe argument list was:\n$origArgv\n$usageMsg"
      return 1
    }
    # The option values are not checked against the values specified in the
    # usage message. We suppose that the user knows what he is doing.
    # If an illegal value is specified, the Tcl error will occur.
    switch $option {
      t           { set showExecutionTime 1 }
      h           { set printHelp 1 }
      g           { set globFileNames 1 }
      encoding    { set textEncoding $value }
      translation { set eolTranslation $value }
      default {
        error "::cmdline::getopt returned unexpected option: \"$option\""
      }
    }
  }
  set argc [llength $argv]

  # If -h option was specified, just print the help message to stdin and return.
  if {$printHelp} {
    puts $helpMsg
    return 0
  }

  # Files are read and written using the specified text encoding.
  # If the encoding is not specified, the system encoding is used.
  # The specified text encoding is also used on stdin and stdout.
  # But if encoding is not specified, stdin and stdout retain their default
  # encoding.
  # (On Windows platform the default encoding for stdin and stdout is "unicode"
  # if they correspond to a terminal. Otherwise the default encoding is the
  # system encoding.)
  if {$textEncoding eq ""} {
    set textEncoding [encoding system]
  } else {
    fconfigure stdin -encoding $textEncoding
    fconfigure stdout -encoding $textEncoding
  }

  # Files are read and written using the specified EOL translation mode.
  # If it is not specified, the default mode (auto) is used.
  if {$eolTranslation eq ""} {
    set eolTranslation auto
  }
  fconfigure stdin -translation $eolTranslation
  fconfigure stdout -translation $eolTranslation

  if {$argc == 0} {
    puts stderr "$argv0: mappings file not specified.\n$usageMsg"
    return 1
  }

  # Open the mappings file.
  set mapFilePath [lindex $argv 0]
  if {[catch { set mapCh [open $mapFilePath RDONLY] } err]} {
    puts stderr \
      "$argv0: error while opening mappings file \"$mapFilePath\": $err"
    return 1
  }
  fconfigure $mapCh -encoding $textEncoding -translation $eolTranslation

  # Read lines from the mappings file and process them to create an array of
  # mappings.
  set lineN 0
  while 1 {
    if {[catch { set inLineLen [gets $mapCh inLine] } err]} {
      puts stderr \
        "$argv0: error while reading mappings file \"$mapFilePath\": $err"
      return 1
    } elseif {$inLineLen == -1} { ;# EOF
      if {[info exists oldStr]} {
        puts stderr \
          "$argv0: syntax error in mappings file \"$mapFilePath\" line $lineN:\
          last old string not followed by new string"
        return 1
      }
      break
    }
    incr lineN

    # Parse the line of the mappings file and check the syntax.
    switch -regexp $inLine {
      "^o:.*" { # an old string
        if {[info exists oldStr]} {
          puts stderr \
            "$argv0: syntax error in mappings file \"$mapFilePath\"\
            line $lineN: consecutive old strings"
          return 1
        }
        set oldStr [string range $inLine 2 end]
        if {$oldStr eq ""} {
          puts stderr \
            "$argv0: syntax error in mappings file \"$mapFilePath\"\
            line $lineN: old string is empty"
          return 1
        }
      }

      "^n:.*" { # a new string
        if {! [info exists oldStr]} {
          puts stderr \
            "$argv0: syntax error in mappings file \"$mapFilePath\"\
            line $lineN: new string not preceded by old string"
          return 1
        }
        set newStr [string range $inLine 2 end]
      }

      "^#.*" { # a comment
      }
      "^[[:space:]]*$" { # an empty line
      }

      default {
        puts stderr \
          "$argv0: syntax error in mappings file \"$mapFilePath\" line $lineN:\
          unrecognized line"
        return 1
      }
    }
    if {[info exists oldStr] && [info exists newStr]} {
      # Add new mapping only if oldStr and newStr differ.
      if {$oldStr ne $newStr} {
        set mapArray($oldStr) $newStr
      }
      unset oldStr newStr
    }
  }

  # Convert the mappings array to a list in the following form:
  # { {old_string new_string} ... }
  # which enables sorting of the mappings in decreasing order ("largest" items
  # first) based on old strings.
  set mappings ""
  foreach {oldStr newStr} [array get mapArray] {
    lappend mappings [list $oldStr $newStr]
  }
  array unset mapArray
  set mappings [lsort -decreasing -index 0 $mappings]

  # Format the mappings list as required by the "string map" command.
  set mappings [join $mappings]

  # The rest of arguments are input files.
  set inputFiles [lrange $argv 1 end]
  set nInputFiles [llength $inputFiles]

  # If no input files specified, filter stdin to stdout and then return.
  if {! $nInputFiles} {
    set result [replace_strings stdin stdout mappings err]
    switch $result {
      noChange -
      changed { return 0 }
      inputError { puts stderr "$argv0: $err"; return 1 }
      outputError { puts stderr "$argv0: $err"; return 1 }
      default {
        error "replace_strings returned unexpected result: \"$result\""
      }
    }
  }

  if {! [llength $mappings]} {
    # No actual mappings defined, nothing to do with the input files.
    # This is not considered an error.
    return 0
  }

  # If -g option was specified, perform globbing on the input file names.
  if {$globFileNames} {
    set i {}
    foreach inFilePath  $inputFiles {
      if {[catch { set i [concat $i [glob $inFilePath]]} err]} {
        puts stderr "$argv0: $err"
        return 1
      }
    }
    set inputFiles $i
    set nInputFiles [llength $inputFiles]
  }
  # Process all input files.
  for {set i 0 } {$i < $nInputFiles} {incr i} {
    # Open the input file. If the input file path contains symbolic links, it is
    # resolved to the real file path.
    set inFilePath [::fileutil::fullnormalize [lindex $inputFiles $i]]
    set inFilePath [file nativename $inFilePath]
    set renamedInFilePath $inFilePath.$oldFileExt
    if {[catch { set inFileCh [open $inFilePath RDONLY] } err]} {
      puts stderr \
        "$argv0: error while opening input file \"$inFilePath\": $err"
      continue
    }
    fconfigure $inFileCh -encoding $textEncoding -translation $eolTranslation

    # Open a temporary output file.
    set outFilePath [file nativename [::fileutil::tempfile $tmpFilePrefix]]
    if {[catch { set outFileCh [open $outFilePath WRONLY] } err]} {
      puts stderr "$argv0: error while opening temporary output file\
        \"$outFilePath\" for writing: $err"
      continue
    }
    fconfigure $outFileCh -encoding $textEncoding -translation $eolTranslation

    # Filter the input file to the temporary output file.
    set result [replace_strings $inFileCh $outFileCh mappings err]
    close $inFileCh

    switch $result {
      noChange { # The output contains no real changes of input text.
         catch { close $outFileCh ; file delete -- $outFilePath }
      }

      changed { # The output contains some real changes of input text.
        if { [catch {close $outFileCh} err] } { # Close output file.
          puts stderr "$argv0: error while closing temporary output file\
            \"$outFilePath\": $err"
          catch {file delete -- $outFilePath}
          set returnCode 1
        } else {
          # Rename input file.
          if {
            [catch {file rename -force -- $inFilePath $renamedInFilePath } err]
          } {
            puts stderr "$argv0: couldn't rename input file \"$inFilePath\" to\
              \"$renamedInFilePath\": $err. Input file not changed.\
                Deleting the temporary output file."
            catch {file delete -- $outFilePath}
            set returnCode 1
          } else {
            # Rename output file to the input file path.
            if {[catch {file rename -force -- $outFilePath $inFilePath} err]} {
              puts stderr "$argv0: couldn't rename temporary output file\
                \"$outFilePath\" to \"$inFilePath\": $err.\
                Deleting the temporary output file."
              catch {file delete -- $outFilePath}

              # Try to restore the original input file name.
              if { [catch \
                {file rename -force -- $renamedInFilePath $inFilePath} err]
              } {
                puts stderr "$argv0: couldn't rename original input file\
                \"$renamedInFilePath\" back to \"$inFilePath\": $err"
              }
              set returnCode 1
            }
          }
        }
      }

      inputError {
        puts stderr \
          "$argv0: error while reading input file \"$inFilePath\": $err.\
          Deleting temporary output file."
        catch { close $outFileCh ; file delete -- $outFilePath }
        set returnCode 1
      }

      outputError {
        puts stderr "$argv0: error while writing to temporary output file\
          \"$outFilePath\": $err. Deleting the file."
        catch { close $outFileCh ; file delete -- $outFilePath }
        set returnCode 1
      }

      default {
        error "replace_strings returned unexpected result: \"$result\""
      }
    }
  }
  return $returnCode
}


# If the DEBUG environment variable is set to "1", trace the procedures.
if {[info exists env(DEBUG)] && ($env(DEBUG) eq "1")} {
  trace add execution main enterstep {proc_tracer main}
  trace add execution replace_strings enterstep {proc_tracer replace_strings}
}

# Call the main procedure and catch any unexpected errors.
if {[catch {
    set executionTime [time {set returnCode [main]}]
    set executionTime [lrange $executionTime 0 1]
    if {[info exists env(DEBUG)] && ($env(DEBUG) eq "1")} {
      puts stderr ">>> Execution time: $executionTime"
    } elseif {$showExecutionTime} {
      puts stderr "$argv0: Execution time: $executionTime"
    }
    exit $returnCode
  }]} {
  puts stderr "$argv0: Tcl error occurred:\n$errorInfo"
  exit 1
}
# vim:columns=80:tw=80

Comments

Use this space for discussion and improvements.