Multivariate Linear Regression

EKB The ::math::statistics package in tcllib includes a linear model procedure for a single variable in which

Y = a * X + b + error

However, sometimes there is a need to do multivariate regression of the form

Y = a1 * X1 + a2 * X2 + ... + aN * XN + b + error

This page has an implementation of multivariate linear regression. (I adapted it from my own Substantively Weighted Least Squares (SWLS) page.)

Here's the library (in the mvlinreg namespace):

# Copyright 2007 Eric Kemp-Benedict
# This library may be used and modified without notification to the author.
# It would be greatly appreciated if corrections and improvements were
# submitted to the appropriate page on the Tcler's Wiki.
package require Tcl 8.4
package require math::linearalgebra 1.0
package require math::statistics 0.1.1

# mvlinreg = Multivariate Linear Regression
namespace eval mvlinreg {
    variable epsilon 1.0e-7
    
    namespace import -force \
        ::math::linearalgebra::mkMatrix \
        ::math::linearalgebra::mkVector \
        ::math::linearalgebra::mkIdentity \
        ::math::linearalgebra::mkDiagonal \
        ::math::linearalgebra::getrow \
        ::math::linearalgebra::setrow \
        ::math::linearalgebra::getcol \
        ::math::linearalgebra::setcol \
        ::math::linearalgebra::getelem \
        ::math::linearalgebra::setelem \
        ::math::linearalgebra::dotproduct \
        ::math::linearalgebra::matmul \
        ::math::linearalgebra::add \
        ::math::linearalgebra::sub \
        ::math::linearalgebra::solveGauss
    
    # NOTE: The authors of math::linearalgebra forgot to
    #   export ::math::linearalgebra::transpose
        
    ########################################
    #
    # t-stats
    #
    ########################################
    # mvlinreg::tstat n ?alpha?
    # Returns inverse of the single-tailed t distribution
    #  given number of degrees of freedom & confidence
    # Defaults to alpha = 0.05
    # Iterates until result is within epsilon of the target
    proc tstat {n {alpha 0.05}} {
        variable epsilon
        variable tvals
        
        if [info exists tvals($n:$alpha)] {
            return $tvals($n:$alpha)
        }
        
        set deltat [expr {100 * $epsilon}]
        # For one-tailed distribution, 
        set ptarg [expr {1.000 - $alpha/2.0}]
        set maxiter 100
        
        # Initial value for t
        set t 2.0
        
        set niter 0
        while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
            set pstar [::math::statistics::cdf-students-t $n $t]
            set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
            set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]
            
            set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]
            
            incr niter
            if {$niter == $maxiter} {
                error "mvlinreg::tstat: Did not converge after $niter iterations"
                return {}
            }
        }
        
        # Cache the result to shorten the call in future
        set tvals($n:$alpha) $t
        
        return $t
    }
        
    ########################################
    #
    # Weighted Least Squares
    #
    ########################################
    # mvlinreg::wls w [list y x's] w [list y x's] ...
    # Returns:
    #   R-squared
    #   Adj R-squared
    #   coefficients of x's in fit
    #   standard errors of the coefficients
    #   95% confidence bounds for coefficients
    proc wls {args} {
        
        # Fill the matrices of x & y values, and weights
        # For n points, k coefficients
        
        # The number of points is equal to half the arguments (n weights, n points)
        set n [expr {[llength $args]/2}]
        
        set firstloop true
        # Sum up all y values to take an average
        set ysum 0
        # Add up the weights
        set wtsum 0
        # Count over rows (points) as you go
        set point 0
        foreach {wt pt} $args {
            
            # Check inputs
            if {[string is double $wt] == 0} {
                error "mvlinreg::wls: Weight \"$wt\" is not a number"
                return {}
            }
            
            ## -- Check dimensions, initialize
            if $firstloop {
                # k = num of vals in pt = 1 + number of x's (because of constant)
                set k [llength $pt]
                if {$n <= [expr {$k + 1}]} {
                    error "mvlinreg::wls: Too few degrees of freedom: $k variables but only $n points"
                    return {}
                }
                set X [mkMatrix $n $k]
                set y [mkVector $n]
                set I_x [mkIdentity $k]
                set I_y [mkIdentity $n]
                
                set firstloop false
                
            } else {
                # Have to have same number of x's for all points
                if {$k != [llength $pt]} {
                    error "mvlinreg::wls: Point \"$pt\" has wrong number of values (expected $k)"
                    # Clean up
                    return {}
                }
            }
            
            ## -- Extract values from set of points
            # Make a list of y values
            set yval [expr {double([lindex $pt 0])}]
            setelem y $point $yval
            set ysum [expr {$ysum + $wt * $yval}]
            set wtsum [expr {$wtsum + $wt}]
            # Add x-values to the x-matrix
            set xrow [lrange $pt 1 end]
            # Add the constant (value = 1.0)
            lappend xrow 1.0
            setrow X $point $xrow
            # Create list of weights & square root of weights
            lappend w [expr {double($wt)}]
            lappend sqrtw [expr {sqrt(double($wt))}]
            
            incr point
            
        }

        set ymean [expr {double($ysum)/$wtsum}]
        set W [mkDiagonal $w]
        set sqrtW [mkDiagonal $sqrtw]
        
        # Calculate sum os square differences for x's
        for {set r 0} {$r < $k} {incr r} {
            set xsqrsum 0.0
            set xmeansum 0.0
            # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
            for {set t 0} {$t < $n} {incr t} {
                set xval [getelem $X $t $r]
                set xmeansum [expr {$xmeansum + double($xval)}]
                set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
            }
            lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
        }
        
        ## -- Set up the X'WX matrix
        set XtW [matmul [::math::linearalgebra::transpose $X] $W]
        set XtWX [matmul $XtW $X]
        
        # Invert
        set M [solveGauss $XtWX $I_x]
        
        set beta [matmul $M [matmul $XtW $y]]
        
        ### -- Residuals & R-squared
        # 1) Generate list of diagonals of the hat matrix
        set H [matmul $X [matmul $M $XtW]]
        for {set i 0} {$i < $n} {incr i} {
            lappend h_ii [getelem $H $i $i]
        }

        set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
        set yhat [matmul $H $y]
        
        # 2) Generate list of residuals, sum of squared residuals, r-squared
        set sstot 0.0
        set ssreg 0.0
        # Note: Relying on representation of Vector as a list for y, yhat
        foreach yval $y wt $w yhatval $yhat {
            set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
            set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
        }
        set r2 [expr {double($ssreg)/$sstot}]
        set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
        set sumsqresid [dotproduct $R $R]
        set s2 [expr {double($sumsqresid) / double($n - $k)}]
        
        ### -- Confidence intervals for coefficients
        set tvalue [tstat [expr {$n - $k}]]
        for {set i 0} {$i < $k} {incr i} {
            set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
            set mid [lindex $beta $i]
            lappend stderrs $stderr
            lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
        }

        return [list $r2 $adjr2 $beta $stderrs $confinterval]
    }
    
    ########################################
    #
    # Ordinary (unweighted) Least Squares
    #
    ########################################
    # mvlinreg::ols [list y x's] [list y x's] ...
    # Returns:
    #   R-squared
    #   Adj R-squared
    #   coefficients of x's in fit
    #   standard errors of the coefficients
    #   95% confidence bounds for coefficients
    proc ols {args} {
        set newargs {}
        foreach pt $args {
            lappend newargs 1 $pt
        }
        return [eval wls $newargs]
    }
}

if 0 {
Here's an example:
}


##
##
## TEST IT
##
##

set data {{183.34 100. 9.43} \
{222.67 100. 9.14} \
{410.77 20.3 8.81} \
{268.57 17.53 8.74} \
{499.28 10.95 8.71} \
{385.97 51.81 8.78} \
{474.53 17.01 8.74} \
{550.04 16.6 8.72} \
{221.54 100. 9.49} \
{284.93 49.9 9.01} \
{324.19 23.02 9.37} \
{364.6 16.59 8.7} \
{338.62 20.03 9.13} \
{543.19 13.44 8.75} \
{356.96 25.64 8.94} \
{296.54 18.63 8.74} \
{336.98 50.07 8.82} \
{398.5 15.35 8.85} \
{216.37 47.41 9.32} \
{325.02 14.83 8.9} \
{315.55 9.43 8.83} \
{321.57 76.98 8.89} \
{443.21 51.28 8.8} \
{493.22 13.19 8.69} \
{554.79 19.62 8.89} \
{527.46 63.23 8.75} \
{490.59 10.72 8.72} \
{389.31 14.39 8.73} \
{218.9 8.65 8.71} \
{315.65 7.29 8.84}}

set results [eval mvlinreg::ols $data]

puts "R-squared: [lindex $results 0]"
puts "Adj R-squared: [lindex $results 1]"
puts "Coefficients \u00B1 s.e. -- \[95% confidence interval\]:"
foreach val [lindex $results 2] se [lindex $results 3] bounds [lindex $results 4] {
    set lb [lindex $bounds 0]
    set ub [lindex $bounds 1]
    puts "   $val \u00B1 $se -- \[$lb to $ub\]"
}

if 0 {
This should give the following output:
 R-squared: 0.371918955025
 Adj R-squared: 0.325394433175
 Coefficients ± s.e. -- [95% confidence interval]:
   -0.432215837601 ± 0.744774375289 -- [-1.96036662835 to 1.09593495315]
   -250.868151309 ± 92.4723901677 -- [-440.605823342 to -61.1304792767]
   2615.78338226 ± 807.559562552 -- [958.808028331 to 4272.75873618]
}

AM (19 february 2007) You are welcome to contribute this to Tcllib :) Just say the word, and I will incorporate it in the package.

EKB I'd be delighted to have it in Tcllib! What should I do to make that happen?

LV One place developers can start is visiting http://tcllib.sf.net/ and posting a new Feature Request, specifying that you are the creator of the code in question, specifying the license you use (tcllib uses BSD based licensing, I believe), and mention that we'd asked you to contribute it. Man pages, test cases, demos, etc. are all wonderful things to have available, if possible. But if they are not available and you don't have time/interest in generating them, then perhaps someone who is a fan of the module could write them.

EKB Thanks. Sounds good. I'll work on creating demos and a man page.

EKB Done! I've submitted it, with a plain text man page (no *roff) to tcllib at sf.net. I hope I've included what's needed!


Philou - 2015-01-09 09:59:57

Nice ! Thanks for this