Version 10 of Multivariate Linear Regression

Updated 2007-06-30 16:36:53 by dkf

EKB The ::math::statistics package in tcllib includes a linear model procedure for a single variable in which

Y = a * X + b + error

However, sometimes there is a need to do multivariate regression of the form

Y = a1 * X1 + a2 * X2 + ... + aN * XN + b + error

This page has an implementation of multivariate linear regression. (I adapted it from my own Substantively Weighted Least Squares (SWLS) page.)

Here's the library (in the mvlinreg namespace):

    # Copyright 2007 Eric Kemp-Benedict
    # This library may be used and modified without notification to the author.
    # It would be greatly appreciated if corrections and improvements were
    # submitted to the appropriate page on the Tcler's Wiki.
    package require Tcl 8.4
    package require math::linearalgebra 1.0
    package require math::statistics 0.1.1

    # mvlinreg = Multivariate Linear Regression
    namespace eval mvlinreg {
        variable epsilon 1.0e-7

        namespace import -force \
            ::math::linearalgebra::mkMatrix \
            ::math::linearalgebra::mkVector \
            ::math::linearalgebra::mkIdentity \
            ::math::linearalgebra::mkDiagonal \
            ::math::linearalgebra::getrow \
            ::math::linearalgebra::setrow \
            ::math::linearalgebra::getcol \
            ::math::linearalgebra::setcol \
            ::math::linearalgebra::getelem \
            ::math::linearalgebra::setelem \
            ::math::linearalgebra::dotproduct \
            ::math::linearalgebra::matmul \
            ::math::linearalgebra::add \
            ::math::linearalgebra::sub \
            ::math::linearalgebra::solveGauss

        # NOTE: The authors of math::linearalgebra forgot to
        #   export ::math::linearalgebra::transpose

        ########################################
        #
        # t-stats
        #
        ########################################
        # mvlinreg::tstat n ?alpha?
        # Returns inverse of the single-tailed t distribution
        #  given number of degrees of freedom & confidence
        # Defaults to alpha = 0.05
        # Iterates until result is within epsilon of the target
        proc tstat {n {alpha 0.05}} {
            variable epsilon
            variable tvals

            if [info exists tvals($n:$alpha)] {
                return $tvals($n:$alpha)
            }

            set deltat [expr {100 * $epsilon}]
            # For one-tailed distribution, 
            set ptarg [expr {1.000 - $alpha/2.0}]
            set maxiter 100

            # Initial value for t
            set t 2.0

            set niter 0
            while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
                set pstar [::math::statistics::cdf-students-t $n $t]
                set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
                set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]

                set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]

                incr niter
                if {$niter == $maxiter} {
                    error "mvlinreg::tstat: Did not converge after $niter iterations"
                    return {}
                }
            }

            # Cache the result to shorten the call in future
            set tvals($n:$alpha) $t

            return $t
        }

        ########################################
        #
        # Weighted Least Squares
        #
        ########################################
        # mvlinreg::wls w [list y x's] w [list y x's] ...
        # Returns:
        #   R-squared
        #   Adj R-squared
        #   coefficients of x's in fit
        #   standard errors of the coefficients
        #   95% confidence bounds for coefficients
        proc wls {args} {

            # Fill the matrices of x & y values, and weights
            # For n points, k coefficients

            # The number of points is equal to half the arguments (n weights, n points)
            set n [expr {[llength $args]/2}]

            set firstloop true
            # Sum up all y values to take an average
            set ysum 0
            # Add up the weights
            set wtsum 0
            # Count over rows (points) as you go
            set point 0
            foreach {wt pt} $args {

                # Check inputs
                if {[string is double $wt] == 0} {
                    error "mvlinreg::wls: Weight \"$wt\" is not a number"
                    return {}
                }

                ## -- Check dimensions, initialize
                if $firstloop {
                    # k = num of vals in pt = 1 + number of x's (because of constant)
                    set k [llength $pt]
                    if {$n <= [expr {$k + 1}]} {
                        error "mvlinreg::wls: Too few degrees of freedom: $k variables but only $n points"
                        return {}
                    }
                    set X [mkMatrix $n $k]
                    set y [mkVector $n]
                    set I_x [mkIdentity $k]
                    set I_y [mkIdentity $n]

                    set firstloop false

                } else {
                    # Have to have same number of x's for all points
                    if {$k != [llength $pt]} {
                        error "mvlinreg::wls: Point \"$pt\" has wrong number of values (expected $k)"
                        # Clean up
                        return {}
                    }
                }

                ## -- Extract values from set of points
                # Make a list of y values
                set yval [expr {double([lindex $pt 0])}]
                setelem y $point $yval
                set ysum [expr {$ysum + $wt * $yval}]
                set wtsum [expr {$wtsum + $wt}]
                # Add x-values to the x-matrix
                set xrow [lrange $pt 1 end]
                # Add the constant (value = 1.0)
                lappend xrow 1.0
                setrow X $point $xrow
                # Create list of weights & square root of weights
                lappend w [expr {double($wt)}]
                lappend sqrtw [expr {sqrt(double($wt))}]

                incr point

            }

            set ymean [expr {double($ysum)/$wtsum}]
            set W [mkDiagonal $w]
            set sqrtW [mkDiagonal $sqrtw]

            # Calculate sum os square differences for x's
            for {set r 0} {$r < $k} {incr r} {
                set xsqrsum 0.0
                set xmeansum 0.0
                # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
                for {set t 0} {$t < $n} {incr t} {
                    set xval [getelem $X $t $r]
                    set xmeansum [expr {$xmeansum + double($xval)}]
                    set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
                }
                lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
            }

            ## -- Set up the X'WX matrix
            set XtW [matmul [::math::linearalgebra::transpose $X] $W]
            set XtWX [matmul $XtW $X]

            # Invert
            set M [solveGauss $XtWX $I_x]

            set beta [matmul $M [matmul $XtW $y]]

            ### -- Residuals & R-squared
            # 1) Generate list of diagonals of the hat matrix
            set H [matmul $X [matmul $M $XtW]]
            for {set i 0} {$i < $n} {incr i} {
                lappend h_ii [getelem $H $i $i]
            }

            set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
            set yhat [matmul $H $y]

            # 2) Generate list of residuals, sum of squared residuals, r-squared
            set sstot 0.0
            set ssreg 0.0
            # Note: Relying on representation of Vector as a list for y, yhat
            foreach yval $y wt $w yhatval $yhat {
                set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
                set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
            }
            set r2 [expr {double($ssreg)/$sstot}]
            set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
            set sumsqresid [dotproduct $R $R]
            set s2 [expr {double($sumsqresid) / double($n - $k)}]

            ### -- Confidence intervals for coefficients
            set tvalue [tstat [expr {$n - $k}]]
            for {set i 0} {$i < $k} {incr i} {
                set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
                set mid [lindex $beta $i]
                lappend stderrs $stderr
                lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
            }

            return [list $r2 $adjr2 $beta $stderrs $confinterval]
        }

        ########################################
        #
        # Ordinary (unweighted) Least Squares
        #
        ########################################
        # mvlinreg::ols [list y x's] [list y x's] ...
        # Returns:
        #   R-squared
        #   Adj R-squared
        #   coefficients of x's in fit
        #   standard errors of the coefficients
        #   95% confidence bounds for coefficients
        proc ols {args} {
            set newargs {}
            foreach pt $args {
                lappend newargs 1 $pt
            }
            return [eval wls $newargs]
        }
    }

if 0 { Here's an example: }

    ##
    ##
    ## TEST IT
    ##
    ##

    set data {{183.34 100. 9.43} \
    {222.67 100. 9.14} \
    {410.77 20.3 8.81} \
    {268.57 17.53 8.74} \
    {499.28 10.95 8.71} \
    {385.97 51.81 8.78} \
    {474.53 17.01 8.74} \
    {550.04 16.6 8.72} \
    {221.54 100. 9.49} \
    {284.93 49.9 9.01} \
    {324.19 23.02 9.37} \
    {364.6 16.59 8.7} \
    {338.62 20.03 9.13} \
    {543.19 13.44 8.75} \
    {356.96 25.64 8.94} \
    {296.54 18.63 8.74} \
    {336.98 50.07 8.82} \
    {398.5 15.35 8.85} \
    {216.37 47.41 9.32} \
    {325.02 14.83 8.9} \
    {315.55 9.43 8.83} \
    {321.57 76.98 8.89} \
    {443.21 51.28 8.8} \
    {493.22 13.19 8.69} \
    {554.79 19.62 8.89} \
    {527.46 63.23 8.75} \
    {490.59 10.72 8.72} \
    {389.31 14.39 8.73} \
    {218.9 8.65 8.71} \
    {315.65 7.29 8.84}}

    set results [eval mvlinreg::ols $data]

    puts "R-squared: [lindex $results 0]"
    puts "Adj R-squared: [lindex $results 1]"
    puts "Coefficients \u00B1 s.e. -- \[95% confidence interval\]:"
    foreach val [lindex $results 2] se [lindex $results 3] bounds [lindex $results 4] {
        set lb [lindex $bounds 0]
        set ub [lindex $bounds 1]
        puts "   $val \u00B1 $se -- \[$lb to $ub\]"
    }

if 0 { This should give the following output:

 R-squared: 0.371918955025
 Adj R-squared: 0.325394433175
 Coefficients ± s.e. -- [95% confidence interval]:
   -0.432215837601 ± 0.744774375289 -- [-1.96036662835 to 1.09593495315]
   -250.868151309 ± 92.4723901677 -- [-440.605823342 to -61.1304792767]
   2615.78338226 ± 807.559562552 -- [958.808028331 to 4272.75873618]

}


AM (19 february 2007) You are welcome to contribute this to Tcllib :) Just say the word, and I will incorporate it in the package.

EKB I'd be delighted to have it in Tcllib! What should I do to make that happen?

LV One place developers can start is visiting http://tcllib.sf.net/ and posting a new Feature Request, specifying that you are the creator of the code in question, specifying the license you use (tcllib uses BSD based licensing, I believe), and mention that we'd asked you to contribute it. Man pages, test cases, demos, etc. are all wonderful things to have available, if possible. But if they are not available and you don't have time/interest in generating them, then perhaps someone who is a fan of the module could write them.

EKB Thanks. Sounds good. I'll work on creating demos and a man page.

EKB Done! I've submitted it, with a plain text man page (no *roff) to tcllib at sf.net. I hope I've included what's needed!


Category Mathematics - Category Statistics