[EKB] The ::math::statistics package in tcllib includes a linear model procedure for a single variable in which

Y = a * X + b + error

However, sometimes there is a need to do multivariate regression of the form

Y = a1 * X1 + a2 * X2 + ... + aN * XN + b + error

This page has an implementation of multivariate linear regression. (I adapted it from my own [Substantively Weighted Least Squares (SWLS)] page.)

Here's the library (in the mvlinreg namespace):

    # Copyright 2007 Eric Kemp-Benedict
    # This library may be used and modified without notification to the author.
    # It would be greatly appreciated if corrections and improvements were
    # submitted to the appropriate page on the Tcler's Wiki.
    package require Tcl 8.4
    package require math::linearalgebra 1.0
    package require math::statistics 0.1.1
    
    # mvlinreg = Multivariate Linear Regression
    namespace eval mvlinreg {
        variable epsilon 1.0e-7
        
        namespace import -force \
            ::math::linearalgebra::mkMatrix \
            ::math::linearalgebra::mkVector \
            ::math::linearalgebra::mkIdentity \
            ::math::linearalgebra::mkDiagonal \
            ::math::linearalgebra::getrow \
            ::math::linearalgebra::setrow \
            ::math::linearalgebra::getcol \
            ::math::linearalgebra::setcol \
            ::math::linearalgebra::getelem \
            ::math::linearalgebra::setelem \
            ::math::linearalgebra::dotproduct \
            ::math::linearalgebra::matmul \
            ::math::linearalgebra::add \
            ::math::linearalgebra::sub \
            ::math::linearalgebra::solveGauss
        
        # NOTE: The authors of math::linearalgebra forgot to
        #   export ::math::linearalgebra::transpose
            
        ########################################
        #
        # t-stats
        #
        ########################################
        # mvlinreg::tstat n ?alpha?
        # Returns inverse of the single-tailed t distribution
        #  given number of degrees of freedom & confidence
        # Defaults to alpha = 0.05
        # Iterates until result is within epsilon of the target
        proc tstat {n {alpha 0.05}} {
            variable epsilon
            variable tvals
            
            if [info exists tvals($n:$alpha)] {
                return $tvals($n:$alpha)
            }
            
            set deltat [expr {100 * $epsilon}]
            # For one-tailed distribution, 
            set ptarg [expr {1.000 - $alpha/2.0}]
            set maxiter 100
            
            # Initial value for t
            set t 2.0
            
            set niter 0
            while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
                set pstar [::math::statistics::cdf-students-t $n $t]
                set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
                set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]
                
                set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]
                
                incr niter
                if {$niter == $maxiter} {
                    error "mvlinreg::tstat: Did not converge after $niter iterations"
                    return {}
                }
            }
            
            # Cache the result to shorten the call in future
            set tvals($n:$alpha) $t
            
            return $t
        }
            
        ########################################
        #
        # Weighted Least Squares
        #
        ########################################
        # mvlinreg::wls w [list y x's] w [list y x's] ...
        # Returns:
        #   R-squared
        #   Adj R-squared
        #   coefficients of x's in fit
        #   standard errors of the coefficients
        #   95% confidence bounds for coefficients
        proc wls {args} {
            
            # Fill the matrices of x & y values, and weights
            # For n points, k coefficients
            
            # The number of points is equal to half the arguments (n weights, n points)
            set n [expr {[llength $args]/2}]
            
            set firstloop true
            # Sum up all y values to take an average
            set ysum 0
            # Add up the weights
            set wtsum 0
            # Count over rows (points) as you go
            set point 0
            foreach {wt pt} $args {
                
                # Check inputs
                if {[string is double $wt] == 0} {
                    error "mvlinreg::wls: Weight \"$wt\" is not a number"
                    return {}
                }
                
                ## -- Check dimensions, initialize
                if $firstloop {
                    # k = num of vals in pt = 1 + number of x's (because of constant)
                    set k [llength $pt]
                    if {$n <= [expr {$k + 1}]} {
                        error "mvlinreg::wls: Too few degrees of freedom: $k variables but only $n points"
                        return {}
                    }
                    set X [mkMatrix $n $k]
                    set y [mkVector $n]
                    set I_x [mkIdentity $k]
                    set I_y [mkIdentity $n]
                    
                    set firstloop false
                    
                } else {
                    # Have to have same number of x's for all points
                    if {$k != [llength $pt]} {
                        error "mvlinreg::wls: Point \"$pt\" has wrong number of values (expected $k)"
                        # Clean up
                        return {}
                    }
                }
                
                ## -- Extract values from set of points
                # Make a list of y values
                set yval [expr {double([lindex $pt 0])}]
                setelem y $point $yval
                set ysum [expr {$ysum + $wt * $yval}]
                set wtsum [expr {$wtsum + $wt}]
                # Add x-values to the x-matrix
                set xrow [lrange $pt 1 end]
                # Add the constant (value = 1.0)
                lappend xrow 1.0
                setrow X $point $xrow
                # Create list of weights & square root of weights
                lappend w [expr {double($wt)}]
                lappend sqrtw [expr {sqrt(double($wt))}]
                
                incr point
                
            }
    
            set ymean [expr {double($ysum)/$wtsum}]
            set W [mkDiagonal $w]
            set sqrtW [mkDiagonal $sqrtw]
            
            # Calculate sum os square differences for x's
            for {set r 0} {$r < $k} {incr r} {
                set xsqrsum 0.0
                set xmeansum 0.0
                # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
                for {set t 0} {$t < $n} {incr t} {
                    set xval [getelem $X $t $r]
                    set xmeansum [expr {$xmeansum + double($xval)}]
                    set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
                }
                lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
            }
            
            ## -- Set up the X'WX matrix
            set XtW [matmul [::math::linearalgebra::transpose $X] $W]
            set XtWX [matmul $XtW $X]
            
            # Invert
            set M [solveGauss $XtWX $I_x]
            
            set beta [matmul $M [matmul $XtW $y]]
            
            ### -- Residuals & R-squared
            # 1) Generate list of diagonals of the hat matrix
            set H [matmul $X [matmul $M $XtW]]
            for {set i 0} {$i < $n} {incr i} {
                lappend h_ii [getelem $H $i $i]
            }
    
            set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
            set yhat [matmul $H $y]
            
            # 2) Generate list of residuals, sum of squared residuals, r-squared
            set sstot 0.0
            set ssreg 0.0
            # Note: Relying on representation of Vector as a list for y, yhat
            foreach yval $y wt $w yhatval $yhat {
                set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
                set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
            }
            set r2 [expr {double($ssreg)/$sstot}]
            set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
            set sumsqresid [dotproduct $R $R]
            set s2 [expr {double($sumsqresid) / double($n - $k)}]
            
            ### -- Confidence intervals for coefficients
            set tvalue [tstat [expr {$n - $k}]]
            for {set i 0} {$i < $k} {incr i} {
                set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
                set mid [lindex $beta $i]
                lappend stderrs $stderr
                lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
            }
    
            return [list $r2 $adjr2 $beta $stderrs $confinterval]
        }
        
        ########################################
        #
        # Ordinary (unweighted) Least Squares
        #
        ########################################
        # mvlinreg::ols [list y x's] [list y x's] ...
        # Returns:
        #   R-squared
        #   Adj R-squared
        #   coefficients of x's in fit
        #   standard errors of the coefficients
        #   95% confidence bounds for coefficients
        proc ols {args} {
            set newargs {}
            foreach pt $args {
                lappend newargs 1 $pt
            }
            return [eval wls $newargs]
        }
    }

if 0 {
Here's an example:
}

    
    ##
    ##
    ## TEST IT
    ##
    ##
    
    set data {{183.34 100. 9.43} \
    {222.67 100. 9.14} \
    {410.77 20.3 8.81} \
    {268.57 17.53 8.74} \
    {499.28 10.95 8.71} \
    {385.97 51.81 8.78} \
    {474.53 17.01 8.74} \
    {550.04 16.6 8.72} \
    {221.54 100. 9.49} \
    {284.93 49.9 9.01} \
    {324.19 23.02 9.37} \
    {364.6 16.59 8.7} \
    {338.62 20.03 9.13} \
    {543.19 13.44 8.75} \
    {356.96 25.64 8.94} \
    {296.54 18.63 8.74} \
    {336.98 50.07 8.82} \
    {398.5 15.35 8.85} \
    {216.37 47.41 9.32} \
    {325.02 14.83 8.9} \
    {315.55 9.43 8.83} \
    {321.57 76.98 8.89} \
    {443.21 51.28 8.8} \
    {493.22 13.19 8.69} \
    {554.79 19.62 8.89} \
    {527.46 63.23 8.75} \
    {490.59 10.72 8.72} \
    {389.31 14.39 8.73} \
    {218.9 8.65 8.71} \
    {315.65 7.29 8.84}}
    
    set results [eval mvlinreg::ols $data]
    
    puts "R-squared: [lindex $results 0]"
    puts "Adj R-squared: [lindex $results 1]"
    puts "Coefficients \u00B1 s.e. -- \[95% confidence interval\]:"
    foreach val [lindex $results 2] se [lindex $results 3] bounds [lindex $results 4] {
        set lb [lindex $bounds 0]
        set ub [lindex $bounds 1]
        puts "   $val \u00B1 $se -- \[$lb to $ub\]"
    }
    
if 0 {
This should give the following output:
 R-squared: 0.371918955025
 Adj R-squared: 0.325394433175
 Coefficients ± s.e. -- [95% confidence interval]:
   -0.432215837601 ± 0.744774375289 -- [-1.96036662835 to 1.09593495315]
   -250.868151309 ± 92.4723901677 -- [-440.605823342 to -61.1304792767]
   2615.78338226 ± 807.559562552 -- [958.808028331 to 4272.75873618]
}


----
[AM] (19 february 2007) You are welcome to contribute this to Tcllib :) Just say the word, and I will incorporate 
it in the package. 
----
[Category Mathematics] - [Category Statistics]