[EKB] The ::math::statistics package in tcllib includes a linear model procedure for a single variable in which Y = a * X + b + error However, sometimes there is a need to do multivariate regression of the form Y = a1 * X1 + a2 * X2 + ... + aN * XN + b + error This page has an implementation of multivariate linear regression. (I adapted it from my own [Substantively Weighted Least Squares (SWLS)] page.) Here's the library (in the mvlinreg namespace): # Copyright 2007 Eric Kemp-Benedict # This library may be used and modified without notification to the author. # It would be greatly appreciated if corrections and improvements were # submitted to the appropriate page on the Tcler's Wiki. package require Tcl 8.4 package require math::linearalgebra 1.0 package require math::statistics 0.1.1 # mvlinreg = Multivariate Linear Regression namespace eval mvlinreg { variable epsilon 1.0e-7 namespace import -force \ ::math::linearalgebra::mkMatrix \ ::math::linearalgebra::mkVector \ ::math::linearalgebra::mkIdentity \ ::math::linearalgebra::mkDiagonal \ ::math::linearalgebra::getrow \ ::math::linearalgebra::setrow \ ::math::linearalgebra::getcol \ ::math::linearalgebra::setcol \ ::math::linearalgebra::getelem \ ::math::linearalgebra::setelem \ ::math::linearalgebra::dotproduct \ ::math::linearalgebra::matmul \ ::math::linearalgebra::add \ ::math::linearalgebra::sub \ ::math::linearalgebra::solveGauss # NOTE: The authors of math::linearalgebra forgot to # export ::math::linearalgebra::transpose ######################################## # # t-stats # ######################################## # mvlinreg::tstat n ?alpha? # Returns inverse of the single-tailed t distribution # given number of degrees of freedom & confidence # Defaults to alpha = 0.05 # Iterates until result is within epsilon of the target proc tstat {n {alpha 0.05}} { variable epsilon variable tvals if [info exists tvals($n:$alpha)] { return $tvals($n:$alpha) } set deltat [expr {100 * $epsilon}] # For one-tailed distribution, set ptarg [expr {1.000 - $alpha/2.0}] set maxiter 100 # Initial value for t set t 2.0 set niter 0 while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} { set pstar [::math::statistics::cdf-students-t $n $t] set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]] set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]] set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}] incr niter if {$niter == $maxiter} { error "mvlinreg::tstat: Did not converge after $niter iterations" return {} } } # Cache the result to shorten the call in future set tvals($n:$alpha) $t return $t } ######################################## # # Weighted Least Squares # ######################################## # mvlinreg::wls w [list y x's] w [list y x's] ... # Returns: # R-squared # Adj R-squared # coefficients of x's in fit # standard errors of the coefficients # 95% confidence bounds for coefficients proc wls {args} { # Fill the matrices of x & y values, and weights # For n points, k coefficients # The number of points is equal to half the arguments (n weights, n points) set n [expr {[llength $args]/2}] set firstloop true # Sum up all y values to take an average set ysum 0 # Add up the weights set wtsum 0 # Count over rows (points) as you go set point 0 foreach {wt pt} $args { # Check inputs if {[string is double $wt] == 0} { error "mvlinreg::wls: Weight \"$wt\" is not a number" return {} } ## -- Check dimensions, initialize if $firstloop { # k = num of vals in pt = 1 + number of x's (because of constant) set k [llength $pt] if {$n <= [expr {$k + 1}]} { error "mvlinreg::wls: Too few degrees of freedom: $k variables but only $n points" return {} } set X [mkMatrix $n $k] set y [mkVector $n] set I_x [mkIdentity $k] set I_y [mkIdentity $n] set firstloop false } else { # Have to have same number of x's for all points if {$k != [llength $pt]} { error "mvlinreg::wls: Point \"$pt\" has wrong number of values (expected $k)" # Clean up return {} } } ## -- Extract values from set of points # Make a list of y values set yval [expr {double([lindex $pt 0])}] setelem y $point $yval set ysum [expr {$ysum + $wt * $yval}] set wtsum [expr {$wtsum + $wt}] # Add x-values to the x-matrix set xrow [lrange $pt 1 end] # Add the constant (value = 1.0) lappend xrow 1.0 setrow X $point $xrow # Create list of weights & square root of weights lappend w [expr {double($wt)}] lappend sqrtw [expr {sqrt(double($wt))}] incr point } set ymean [expr {double($ysum)/$wtsum}] set W [mkDiagonal $w] set sqrtW [mkDiagonal $sqrtw] # Calculate sum os square differences for x's for {set r 0} {$r < $k} {incr r} { set xsqrsum 0.0 set xmeansum 0.0 # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n for {set t 0} {$t < $n} {incr t} { set xval [getelem $X $t $r] set xmeansum [expr {$xmeansum + double($xval)}] set xsqrsum [expr {$xsqrsum + double($xval * $xval)}] } lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}] } ## -- Set up the X'WX matrix set XtW [matmul [::math::linearalgebra::transpose $X] $W] set XtWX [matmul $XtW $X] # Invert set M [solveGauss $XtWX $I_x] set beta [matmul $M [matmul $XtW $y]] ### -- Residuals & R-squared # 1) Generate list of diagonals of the hat matrix set H [matmul $X [matmul $M $XtW]] for {set i 0} {$i < $n} {incr i} { lappend h_ii [getelem $H $i $i] } set R [matmul $sqrtW [matmul [sub $I_y $H] $y]] set yhat [matmul $H $y] # 2) Generate list of residuals, sum of squared residuals, r-squared set sstot 0.0 set ssreg 0.0 # Note: Relying on representation of Vector as a list for y, yhat foreach yval $y wt $w yhatval $yhat { set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}] set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}] } set r2 [expr {double($ssreg)/$sstot}] set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}] set sumsqresid [dotproduct $R $R] set s2 [expr {double($sumsqresid) / double($n - $k)}] ### -- Confidence intervals for coefficients set tvalue [tstat [expr {$n - $k}]] for {set i 0} {$i < $k} {incr i} { set stderr [expr {sqrt($s2 * [getelem $M $i $i])}] set mid [lindex $beta $i] lappend stderrs $stderr lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]] } return [list $r2 $adjr2 $beta $stderrs $confinterval] } ######################################## # # Ordinary (unweighted) Least Squares # ######################################## # mvlinreg::ols [list y x's] [list y x's] ... # Returns: # R-squared # Adj R-squared # coefficients of x's in fit # standard errors of the coefficients # 95% confidence bounds for coefficients proc ols {args} { set newargs {} foreach pt $args { lappend newargs 1 $pt } return [eval wls $newargs] } } if 0 { Here's an example: } ## ## ## TEST IT ## ## set data {{183.34 100. 9.43} \ {222.67 100. 9.14} \ {410.77 20.3 8.81} \ {268.57 17.53 8.74} \ {499.28 10.95 8.71} \ {385.97 51.81 8.78} \ {474.53 17.01 8.74} \ {550.04 16.6 8.72} \ {221.54 100. 9.49} \ {284.93 49.9 9.01} \ {324.19 23.02 9.37} \ {364.6 16.59 8.7} \ {338.62 20.03 9.13} \ {543.19 13.44 8.75} \ {356.96 25.64 8.94} \ {296.54 18.63 8.74} \ {336.98 50.07 8.82} \ {398.5 15.35 8.85} \ {216.37 47.41 9.32} \ {325.02 14.83 8.9} \ {315.55 9.43 8.83} \ {321.57 76.98 8.89} \ {443.21 51.28 8.8} \ {493.22 13.19 8.69} \ {554.79 19.62 8.89} \ {527.46 63.23 8.75} \ {490.59 10.72 8.72} \ {389.31 14.39 8.73} \ {218.9 8.65 8.71} \ {315.65 7.29 8.84}} set results [eval mvlinreg::ols $data] puts "R-squared: [lindex $results 0]" puts "Adj R-squared: [lindex $results 1]" puts "Coefficients \u00B1 s.e. -- \[95% confidence interval\]:" foreach val [lindex $results 2] se [lindex $results 3] bounds [lindex $results 4] { set lb [lindex $bounds 0] set ub [lindex $bounds 1] puts " $val \u00B1 $se -- \[$lb to $ub\]" } if 0 { This should give the following output: R-squared: 0.371918955025 Adj R-squared: 0.325394433175 Coefficients ± s.e. -- [95% confidence interval]: -0.432215837601 ± 0.744774375289 -- [-1.96036662835 to 1.09593495315] -250.868151309 ± 92.4723901677 -- [-440.605823342 to -61.1304792767] 2615.78338226 ± 807.559562552 -- [958.808028331 to 4272.75873618] } ---- [AM] (19 february 2007) You are welcome to contribute this to Tcllib :) Just say the word, and I will incorporate it in the package. ---- [Category Mathematics] - [Category Statistics]