Version 1 of Multivariate Linear Regression

Updated 2007-02-17 14:36:57

EKB The ::math::statistics package in tcllib includes a linear model procedure for a single variable in which

Y = a * X + b + error

However, sometimes there is a need to do multivariate regression of the form

Y = a1 * X1 + a2 * X2 + ... + aN * XN + b + error

This page has an implementation of multivariate linear regression.

Here's the library (in the mvlinreg namespace):

    package require Tcl 8.4
    package require math::linearalgebra 1.0
    package require math::statistics 0.1.1

    # mvlinreg = Multivariate Linear Regression
    namespace eval mvlinreg {
        variable epsilon 1.0e-7

        namespace import -force \
            ::math::linearalgebra::mkMatrix \
            ::math::linearalgebra::mkVector \
            ::math::linearalgebra::mkIdentity \
            ::math::linearalgebra::mkDiagonal \
            ::math::linearalgebra::getrow \
            ::math::linearalgebra::setrow \
            ::math::linearalgebra::getcol \
            ::math::linearalgebra::setcol \
            ::math::linearalgebra::getelem \
            ::math::linearalgebra::setelem \
            ::math::linearalgebra::dotproduct \
            ::math::linearalgebra::matmul \
            ::math::linearalgebra::add \
            ::math::linearalgebra::sub \
            ::math::linearalgebra::solveGauss

        # NOTE: The authors of math::linearalgebra forgot to
        #   export ::math::linearalgebra::transpose

        ########################################
        #
        # t-stats
        #
        ########################################
        # mvlinreg::tstat n ?alpha?
        # Returns inverse of the single-tailed t distribution
        #  given number of degrees of freedom & confidence
        # Defaults to alpha = 0.05
        # Iterates until result is within epsilon of the target
        proc tstat {n {alpha 0.05}} {
            variable epsilon
            variable tvals

            if [info exists tvals($n:$alpha)] {
                return $tvals($n:$alpha)
            }

            set deltat [expr {100 * $epsilon}]
            # For one-tailed distribution, 
            set ptarg [expr {1.000 - $alpha/2.0}]
            set maxiter 100

            # Initial value for t
            set t 2.0

            set niter 0
            while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
                set pstar [::math::statistics::cdf-students-t $n $t]
                set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
                set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]

                set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]

                incr niter
                if {$niter == $maxiter} {
                    error "mvlinreg::tstat: Did not converge after $niter iterations"
                    return {}
                }
            }

            # Cache the result to shorten the call in future
            set tvals($n:$alpha) $t

            return $t
        }

        ########################################
        #
        # Weighted Least Squares
        #
        ########################################
        # mvlinreg::wls w [list y x's] w [list y x's] ...
        # Returns:
        #   R-squared
        #   Adj R-squared
        #   coefficients of x's in fit
        #   standard errors of the coefficients
        #   95% confidence bounds for coefficients
        proc wls {args} {

            # Fill the matrices of x & y values, and weights
            # For n points, k coefficients

            # The number of points is equal to half the arguments (n weights, n points)
            set n [expr {[llength $args]/2}]

            set firstloop true
            # Sum up all y values to take an average
            set ysum 0
            # Add up the weights
            set wtsum 0
            # Count over rows (points) as you go
            set point 0
            foreach {wt pt} $args {

                # Check inputs
                if {[string is double $wt] == 0} {
                    error "mvlinreg::wls: Weight \"$wt\" is not a number"
                    return {}
                }

                ## -- Check dimensions, initialize
                if $firstloop {
                    # k = num of vals in pt = 1 + number of x's (because of constant)
                    set k [llength $pt]
                    if {$n <= [expr {$k + 1}]} {
                        error "mvlinreg::wls: Too few degrees of freedom: $k variables but only $n points"
                        return {}
                    }
                    set X [mkMatrix $n $k]
                    set y [mkVector $n]
                    set I_x [mkIdentity $k]
                    set I_y [mkIdentity $n]

                    set firstloop false

                } else {
                    # Have to have same number of x's for all points
                    if {$k != [llength $pt]} {
                        error "mvlinreg::wls: Point \"$pt\" has wrong number of values (expected $k)"
                        # Clean up
                        return {}
                    }
                }

                ## -- Extract values from set of points
                # Make a list of y values
                set yval [expr {double([lindex $pt 0])}]
                setelem y $point $yval
                set ysum [expr {$ysum + $wt * $yval}]
                set wtsum [expr {$wtsum + $wt}]
                # Add x-values to the x-matrix
                set xrow [lrange $pt 1 end]
                # Add the constant (value = 1.0)
                lappend xrow 1.0
                setrow X $point $xrow
                # Create list of weights & square root of weights
                lappend w [expr {double($wt)}]
                lappend sqrtw [expr {sqrt(double($wt))}]

                incr point

            }

            set ymean [expr {double($ysum)/$wtsum}]
            set W [mkDiagonal $w]
            set sqrtW [mkDiagonal $sqrtw]

            # Calculate sum os square differences for x's
            for {set r 0} {$r < $k} {incr r} {
                set xsqrsum 0.0
                set xmeansum 0.0
                # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
                for {set t 0} {$t < $n} {incr t} {
                    set xval [getelem $X $t $r]
                    set xmeansum [expr {$xmeansum + double($xval)}]
                    set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
                }
                lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
            }

            ## -- Set up the X'WX matrix
            set XtW [matmul [::math::linearalgebra::transpose $X] $W]
            set XtWX [matmul $XtW $X]

            # Invert
            set M [solveGauss $XtWX $I_x]

            set beta [matmul $M [matmul $XtW $y]]

            ### -- Residuals & R-squared
            # 1) Generate list of diagonals of the hat matrix
            set H [matmul $X [matmul $M $XtW]]
            for {set i 0} {$i < $n} {incr i} {
                lappend h_ii [getelem $H $i $i]
            }

            set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
            set yhat [matmul $H $y]

            # 2) Generate list of residuals, sum of squared residuals, r-squared
            set sstot 0.0
            set ssreg 0.0
            # Note: Relying on representation of Vector as a list for y, yhat
            foreach yval $y wt $w yhatval $yhat {
                set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
                set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
            }
            set r2 [expr {double($ssreg)/$sstot}]
            set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
            set sumsqresid [dotproduct $R $R]
            set s2 [expr {double($sumsqresid) / double($n - $k)}]

            ### -- Confidence intervals for coefficients
            set tvalue [tstat [expr {$n - $k}]]
            for {set i 0} {$i < $k} {incr i} {
                set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
                set mid [lindex $beta $i]
                lappend stderrs $stderr
                lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
            }

            return [list $r2 $adjr2 $beta $stderrs $confinterval]
        }

        ########################################
        #
        # Ordinary (unweighted) Least Squares
        #
        ########################################
        # mvlinreg::ols [list y x's] [list y x's] ...
        # Returns:
        #   R-squared
        #   Adj R-squared
        #   coefficients of x's in fit
        #   standard errors of the coefficients
        #   95% confidence bounds for coefficients
        proc ols {args} {
            set newargs {}
            foreach pt $args {
                lappend newargs 1 $pt
            }
            return [eval wls $newargs]
        }
    }

if 0 { Here's an example: }

    ##
    ##
    ## TEST IT
    ##
    ##

    set data {{183.34 100. 9.43} \
    {222.67 100. 9.14} \
    {410.77 20.3 8.81} \
    {268.57 17.53 8.74} \
    {499.28 10.95 8.71} \
    {385.97 51.81 8.78} \
    {474.53 17.01 8.74} \
    {550.04 16.6 8.72} \
    {221.54 100. 9.49} \
    {284.93 49.9 9.01} \
    {324.19 23.02 9.37} \
    {364.6 16.59 8.7} \
    {338.62 20.03 9.13} \
    {543.19 13.44 8.75} \
    {356.96 25.64 8.94} \
    {296.54 18.63 8.74} \
    {336.98 50.07 8.82} \
    {398.5 15.35 8.85} \
    {216.37 47.41 9.32} \
    {325.02 14.83 8.9} \
    {315.55 9.43 8.83} \
    {321.57 76.98 8.89} \
    {443.21 51.28 8.8} \
    {493.22 13.19 8.69} \
    {554.79 19.62 8.89} \
    {527.46 63.23 8.75} \
    {490.59 10.72 8.72} \
    {389.31 14.39 8.73} \
    {218.9 8.65 8.71} \
    {315.65 7.29 8.84}}

    set results [eval mvlinreg::ols $data]

    puts "R-squared: [lindex $results 0]"
    puts "Adj R-squared: [lindex $results 1]"
    puts "Coefficients \u00B1 s.e. -- \[95% confidence interval\]:"
    foreach val [lindex $results 2] se [lindex $results 3] bounds [lindex $results 4] {
        set lb [lindex $bounds 0]
        set ub [lindex $bounds 1]
        puts "   $val \u00B1 $se -- \[$lb to $ub\]"
    }

if 0 { This should give the following output:

 R-squared: 0.371918955025
 Adj R-squared: 0.325394433175
 Coefficients ± s.e. -- [95% confidence interval]:
   -0.432215837601 ± 0.744774375289 -- [-1.96036662835 to 1.09593495315]
   -250.868151309 ± 92.4723901677 -- [-440.605823342 to -61.1304792767]
   2615.78338226 ± 807.559562552 -- [958.808028331 to 4272.75873618]

}


Category Mathematics - Category Statistics