Version 8 of LZ4

Updated 2017-05-23 06:08:04 by dbohdan

LZ4 is a byte-oriented lossless compression algorithm optimized for speed. Its low-level implementations provide extremely fast decompression compared to the competitors.

Tcl implementation

The following pure Tcl implementation of the LZ4 decoder has been tested in Tcl 8.5, Tcl 8.6 and Jim Tcl, but currently only works in the latter if Jim Tcl was compiled without UTF-8 support (see README.utf-8 for the reason). This version doesn't verify checksums, so beware data corruption. You may wish to checksum the data before compression and verify its integrity after decompression with one of the checksum algorithms in Tcllib like MD5 or SHA256 or Tcl 8.6's built-in zlib crc32 command.

Download with wiki-reaper: wiki-reaper 48789 0 > lz4-0.1.1.tm

Code

# A pure Tcl LZ4 decoder.
# Copyright (c) 2017 dbohdan
# License: MIT
# This decoder implements version 1.5 of the LZ4 frame spec. It does not verify
# checksums. It is known to work in Tcl 8.5, Tcl 8.6 and non-UTF-8 builds of
# Jim Tcl 0.74-0.77.
namespace eval ::lz4 {
    variable version 0.1.1
}

proc ::lz4::assert-equal {actual expected} {
    if {$actual ne $expected} {
        if {[string length $actual] > 200} {
            set actual [string range $actual 0 199]...
        }
        if {[string length $expected] > 200} {
            set expected [string range $expected 0 199]...
        }
        error "expected \"$expected\",\n\
               but got \"$actual\""
    }
}

proc ::lz4::decode-block {data ptr endPtr window} {
    set result {}
    while 1 {
        if {![binary scan $data "@$ptr cu" token]} {
            error "data truncated"
        }
        incr ptr 1
        set litLen   [expr {($token >> 4) & 0x0F}]
        set matchLen [expr {$token & 0x0F}]
        if {$litLen == 15} {
            while 1 {
                if {![binary scan $data "@$ptr cu" byte]} {
                    error "data truncated"
                }
                incr ptr 1
                incr litLen $byte
                if {$byte < 255} break
            }
        }
        if {![binary scan $data "@$ptr a$litLen" literals]} {
            error "data truncated"
        }
        incr ptr $litLen
        append window $literals
        append result $literals
        # The last sequence is incomplete.
        if {$ptr < $endPtr} {
            if {![binary scan $data "@$ptr su" offset]} {
                error "data truncated"
            }
            incr ptr 2
            if {$matchLen == 15} {
                while 1 {
                    if {![binary scan $data "@$ptr cu" byte]} {
                        error "data truncated"
                    }
                    incr ptr 1
                    incr matchLen $byte
                    if {$byte < 255} break
                }
            }
            incr matchLen 4
            incr offset -1
            set endOffset [expr {
                $offset - $matchLen > 0 ? $offset - $matchLen : 0
            }]
            set overlapLen [expr {
                $offset - $matchLen > 0 ? 0 : $matchLen - $offset
            }]
            set match [string range $window end-$offset end-$endOffset]
            set matchWithOverlap [string range [string repeat $match [expr {
                ($overlapLen / ($offset - $endOffset + 1)) + 2
            }]] 0 $matchLen-1]
            append window $matchWithOverlap
            append result $matchWithOverlap
        }
        if {$ptr == $endPtr} break
        if {$ptr > $endPtr} {
            error "read beyond block end"
        }
    }
    return [list $ptr $window $result]
}

proc ::lz4::decode-frame {data ptr} {
    # Decode and validate the header.
    if {![binary scan $data "@$ptr i" magic]} {
        error "data truncated"
    }
    incr ptr 4
    if {$magic == 0x184D2204} {
        # Normal frame.
    } elseif {(0x184D2A50 <= $magic) && ($magic <= 0x184D2A5F)} {
        # Skippable frame.
        if {![binary scan $data "@$ptr iu" frameSize]} {
            error "data truncated"
        }
        incr ptr 4
        incr ptr $frameSize
        return [list $ptr {}]
    } else {
        error "unexpected magic number: $magic"
    }
    set flags {}
    if {![binary scan $data "@$ptr cu cu" flags blockDescr]} {
        error "data truncated"
    }
    incr ptr 2
    set flagsReserved      [expr {($flags & 0b00000011) == 0}]
    set hasContentChecksum [expr {($flags & 0b00000100) == 0b00000100}]
    set hasContentSize     [expr {($flags & 0b00001000) == 0b00001000}]
    set hasBlockChecksums  [expr {($flags & 0b00010000) == 0b00010000}]
    set blockIndep         [expr {($flags & 0b00100000) == 0b00100000}]
    set version            [expr {($flags & 0b11000000) == 0b01000000}]
    if {!$flagsReserved} {
        error "FLG reserved bits aren't zero"
    }
    if {!$version} {
        error "frame version isn't \"01\""
    }
    set blockDescrReserved [expr {($blockDescr & 0b10001111) == 0}]
    set blockMaxSize       [expr {$blockDescr >> 4}]
    if {!$blockDescrReserved} {
        error "BD reserved bits aren't zero"
    }
    if {$blockMaxSize < 4} {
        error "invalid block maximum size ($blockMaxSize < 4)"
    }
    if {$hasContentSize} {
        if {![binary scan $data "@$ptr wu" uncompressedSize]} {
            error "data truncated"
        }
        incr ptr 8
    }
    if {![binary scan $data "@$ptr cu" headerChecksum]} {
        error "data truncated"
    }
    incr ptr 1

    # Decode the blocks.
    set window {}
    while 1 {
        if {![binary scan $data "@$ptr iu" blockSize]} {
            error "data truncated"
        }
        incr ptr 4
        set compressed [expr {!($blockSize >> 31)}]
        set blockSize [expr {$blockSize & 0x7fffffff}] ;# Zero the highest bit.
        if {$blockSize == 0} break

        if {$compressed} {
            lassign [decode-block $data \
                                  $ptr \
                                  [expr {$ptr + $blockSize}] $window] \
                    ptr \
                    window \
                    decodedBlock
            if {$blockIndep} {
                set window {}
            } else {
                set window [string range $window end-0xFFFF end]
            }
        } else {
            if {![binary scan $data "@$ptr a$blockSize" decodedBlock]} {
                error "data truncated"
            }
            incr ptr $blockSize
        }
        append result $decodedBlock
    }

    # Decode the checksum.
    if {$hasContentChecksum} {
        if {![binary scan $data "@$ptr iu" contentChecksum]} {
            error "data truncated"
        }
        incr ptr 4
    }

    return [list $ptr $result]
}

proc ::lz4::decode data {
    set ptr 0
    set result {}
    while 1 {
        lassign [decode-frame $data $ptr] ptr frame
        append result $frame
        break
    }
    return $result
}

proc ::lz4::file-test path {
    if {![file exists $path]} {
        puts stderr "can't find file \"$path\" -- skipping test"
        return
    }
    # Can't use -ignorestderr because of Jim Tcl compatiblity.
    if {[catch {exec lz4 --version 2>@1}]} {
        puts stderr "can't run lz4 -- skipping test"
        return
    }
    set ch [open $path rb]
    set data [read $ch]
    close $ch
    set ch [open [list |lz4 -c -9 $path]]
    fconfigure $ch -translation binary
    set dataCompressed [read $ch]
    close $ch
    assert-equal [decode $dataCompressed] $data
}

proc ::lz4::test {} {
    set hello {Hello, World!}
    set helloCompressed [join {
        \x04\x22 \x4d\x18 \x64\x40 \xa7\x0d \x00\x00 \x80\x48 \x65\x6c \x6c\x6f
        \x2c\x20 \x57\x6f \x72\x6c \x64\x21 \x00\x00 \x00\x00 \xe8\x43 \xd0\x9e
    } {}]

    set seq {}
    for {set i 0} {$i < 4} {incr i} {
        append seq [string repeat $i 64]
    }
    set seqCompressed [join {
        \x04\x22 \x4d\x18 \x64\x40 \xa7\x1a \x00\x00 \x00\x1f \x30\x01 \x00\x2c
        \x1f\x31 \x01\x00 \x2c\x1f \x32\x01 \x00\x2c \x1f\x33 \x01\x00 \x27\x50
        \x33\x33 \x33\x33 \x33\x00 \x00\x00 \x00\x80 \xf5\x97 \x31
    } {}]

    set abc "abcabcabc 123123123 abcabcabc 123123123 abcabcabc123123123\n"
    set abcCompressed [join {
        \x04 \x22 \x4d \x18 \x64 \x40 \xa7 \x1d \x00 \x00 \x00 \x32
        \x61 \x62 \x63 \x03 \x00 \x42 \x20 \x31 \x32 \x33 \x03 \x00
        \x1f \x20 \x14 \x00 \x0a \xa0 \x31 \x32 \x33 \x31 \x32 \x33
        \x31 \x32 \x33 \x0a \x00 \x00 \x00 \x00 \xc3 \x67 \x9d \xbf
    } {}]

    set blah "1blah2HELLOblah3blah4blah foo bar blah !!!!\
              213218372132-------------------"
    set blahCompressed [join {
        \x04 \x22 \x4d \x18 \x64 \x40 \xa7 \x3c \x00 \x00 \x00 \xb0
        \x31 \x62 \x6c \x61 \x68 \x32 \x48 \x45 \x4c \x4c \x4f \x0a
        \x00 \x10 \x33 \x05 \x00 \x10 \x34 \x05 \x00 \x91 \x20 \x66
        \x6f \x6f \x20 \x62 \x61 \x72 \x20 \x0d \x00 \xd0 \x21 \x21
        \x21 \x21 \x20 \x32 \x31 \x33 \x32 \x31 \x38 \x33 \x37 \x08
        \x00 \x19 \x2d \x01 \x00 \x50 \x2d \x2d \x2d \x2d \x2d \x00
        \x00 \x00 \x00 \xa4 \xeb \xf6 \xac
    } {}]

    puts stderr "running tests"
    puts stderr "--- hello"
    assert-equal [decode $helloCompressed] $hello
    puts stderr "--- seq"
    assert-equal [decode $seqCompressed]   $seq
    puts stderr "--- abc"
    assert-equal [decode $abcCompressed]   $abc
    puts stderr "--- blah"
    assert-equal [decode $blahCompressed]  $blah
    puts stderr "--- empty"
    catch {decode {}} err
    assert-equal $err "data truncated"
    puts stderr "--- passwd"
    file-test /etc/passwd
    puts stderr "--- sh (binary)"
    file-test /bin/sh
}

# If this is the main script...
if {[info exists argv0] && ([file tail [info script]] eq [file tail $argv0])} {
    if {$argv eq {--test}} {
        ::lz4::test
    } elseif {[lindex $argv 0] ne {}} {
        lassign $argv filename
        if {$filename eq {-}} {
            set ch stdin
        } else {
            set ch [open $filename rb]
        }
        set data [read $ch]
        close $ch
        fconfigure stdout -translation binary
        puts -nonewline [::lz4::decode $data]
    } else {
        puts "usage: [info script] (--test | - | filename)"
    }
}

Performance comparison

> echo 'puts [info patchlevel]' | tclsh
8.6.5
> lz4 -k -9 ffmpeg.exe
Compressed 40859136 bytes into 18846155 bytes ==> 46.12%
> time lz4 -d -c ffmpeg.exe.lz4 > /dev/null
0.03user 0.03system 0:00.08elapsed 76%CPU
> time tclsh lz4.tcl ffmpeg.exe.lz4 > /dev/null
18.59user 0.32system 0:19.05elapsed 99%CPU

See also