[EKB] The ::math::statistics package in tcllib includes a linear model procedure for a single variable in which Y = a * X + b + error However, sometimes there is a need to do multivariate regression of the form Y = a1 * X1 + a2 * X2 + ... + aN * XN + b + error This page has an implementation of multivariate linear regression. (I adapted it from my own [Substantively Weighted Least Squares (SWLS)] page.) Here's the library (in the mvlinreg namespace): ====== # Copyright 2007 Eric Kemp-Benedict # This library may be used and modified without notification to the author. # It would be greatly appreciated if corrections and improvements were # submitted to the appropriate page on the Tcler's Wiki. package require Tcl 8.4 package require math::linearalgebra 1.0 package require math::statistics 0.1.1 # mvlinreg = Multivariate Linear Regression namespace eval mvlinreg { variable epsilon 1.0e-7 namespace import -force \ ::math::linearalgebra::mkMatrix \ ::math::linearalgebra::mkVector \ ::math::linearalgebra::mkIdentity \ ::math::linearalgebra::mkDiagonal \ ::math::linearalgebra::getrow \ ::math::linearalgebra::setrow \ ::math::linearalgebra::getcol \ ::math::linearalgebra::setcol \ ::math::linearalgebra::getelem \ ::math::linearalgebra::setelem \ ::math::linearalgebra::dotproduct \ ::math::linearalgebra::matmul \ ::math::linearalgebra::add \ ::math::linearalgebra::sub \ ::math::linearalgebra::solveGauss # NOTE: The authors of math::linearalgebra forgot to # export ::math::linearalgebra::transpose ######################################## # # t-stats # ######################################## # mvlinreg::tstat n ?alpha? # Returns inverse of the single-tailed t distribution # given number of degrees of freedom & confidence # Defaults to alpha = 0.05 # Iterates until result is within epsilon of the target proc tstat {n {alpha 0.05}} { variable epsilon variable tvals if [info exists tvals($n:$alpha)] { return $tvals($n:$alpha) } set deltat [expr {100 * $epsilon}] # For one-tailed distribution, set ptarg [expr {1.000 - $alpha/2.0}] set maxiter 100 # Initial value for t set t 2.0 set niter 0 while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} { set pstar [::math::statistics::cdf-students-t $n $t] set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]] set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]] set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}] incr niter if {$niter == $maxiter} { error "mvlinreg::tstat: Did not converge after $niter iterations" return {} } } # Cache the result to shorten the call in future set tvals($n:$alpha) $t return $t } ######################################## # # Weighted Least Squares # ######################################## # mvlinreg::wls w [list y x's] w [list y x's] ... # Returns: # R-squared # Adj R-squared # coefficients of x's in fit # standard errors of the coefficients # 95% confidence bounds for coefficients proc wls {args} { # Fill the matrices of x & y values, and weights # For n points, k coefficients # The number of points is equal to half the arguments (n weights, n points) set n [expr {[llength $args]/2}] set firstloop true # Sum up all y values to take an average set ysum 0 # Add up the weights set wtsum 0 # Count over rows (points) as you go set point 0 foreach {wt pt} $args { # Check inputs if {[string is double $wt] == 0} { error "mvlinreg::wls: Weight \"$wt\" is not a number" return {} } ## -- Check dimensions, initialize if $firstloop { # k = num of vals in pt = 1 + number of x's (because of constant) set k [llength $pt] if {$n <= [expr {$k + 1}]} { error "mvlinreg::wls: Too few degrees of freedom: $k variables but only $n points" return {} } set X [mkMatrix $n $k] set y [mkVector $n] set I_x [mkIdentity $k] set I_y [mkIdentity $n] set firstloop false } else { # Have to have same number of x's for all points if {$k != [llength $pt]} { error "mvlinreg::wls: Point \"$pt\" has wrong number of values (expected $k)" # Clean up return {} } } ## -- Extract values from set of points # Make a list of y values set yval [expr {double([lindex $pt 0])}] setelem y $point $yval set ysum [expr {$ysum + $wt * $yval}] set wtsum [expr {$wtsum + $wt}] # Add x-values to the x-matrix set xrow [lrange $pt 1 end] # Add the constant (value = 1.0) lappend xrow 1.0 setrow X $point $xrow # Create list of weights & square root of weights lappend w [expr {double($wt)}] lappend sqrtw [expr {sqrt(double($wt))}] incr point } set ymean [expr {double($ysum)/$wtsum}] set W [mkDiagonal $w] set sqrtW [mkDiagonal $sqrtw] # Calculate sum os square differences for x's for {set r 0} {$r < $k} {incr r} { set xsqrsum 0.0 set xmeansum 0.0 # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n for {set t 0} {$t < $n} {incr t} { set xval [getelem $X $t $r] set xmeansum [expr {$xmeansum + double($xval)}] set xsqrsum [expr {$xsqrsum + double($xval * $xval)}] } lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}] } ## -- Set up the X'WX matrix set XtW [matmul [::math::linearalgebra::transpose $X] $W] set XtWX [matmul $XtW $X] # Invert set M [solveGauss $XtWX $I_x] set beta [matmul $M [matmul $XtW $y]] ### -- Residuals & R-squared # 1) Generate list of diagonals of the hat matrix set H [matmul $X [matmul $M $XtW]] for {set i 0} {$i < $n} {incr i} { lappend h_ii [getelem $H $i $i] } set R [matmul $sqrtW [matmul [sub $I_y $H] $y]] set yhat [matmul $H $y] # 2) Generate list of residuals, sum of squared residuals, r-squared set sstot 0.0 set ssreg 0.0 # Note: Relying on representation of Vector as a list for y, yhat foreach yval $y wt $w yhatval $yhat { set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}] set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}] } set r2 [expr {double($ssreg)/$sstot}] set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}] set sumsqresid [dotproduct $R $R] set s2 [expr {double($sumsqresid) / double($n - $k)}] ### -- Confidence intervals for coefficients set tvalue [tstat [expr {$n - $k}]] for {set i 0} {$i < $k} {incr i} { set stderr [expr {sqrt($s2 * [getelem $M $i $i])}] set mid [lindex $beta $i] lappend stderrs $stderr lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]] } return [list $r2 $adjr2 $beta $stderrs $confinterval] } ######################################## # # Ordinary (unweighted) Least Squares # ######################################## # mvlinreg::ols [list y x's] [list y x's] ... # Returns: # R-squared # Adj R-squared # coefficients of x's in fit # standard errors of the coefficients # 95% confidence bounds for coefficients proc ols {args} { set newargs {} foreach pt $args { lappend newargs 1 $pt } return [eval wls $newargs] } } if 0 { Here's an example: } ## ## ## TEST IT ## ## set data {{183.34 100. 9.43} \ {222.67 100. 9.14} \ {410.77 20.3 8.81} \ {268.57 17.53 8.74} \ {499.28 10.95 8.71} \ {385.97 51.81 8.78} \ {474.53 17.01 8.74} \ {550.04 16.6 8.72} \ {221.54 100. 9.49} \ {284.93 49.9 9.01} \ {324.19 23.02 9.37} \ {364.6 16.59 8.7} \ {338.62 20.03 9.13} \ {543.19 13.44 8.75} \ {356.96 25.64 8.94} \ {296.54 18.63 8.74} \ {336.98 50.07 8.82} \ {398.5 15.35 8.85} \ {216.37 47.41 9.32} \ {325.02 14.83 8.9} \ {315.55 9.43 8.83} \ {321.57 76.98 8.89} \ {443.21 51.28 8.8} \ {493.22 13.19 8.69} \ {554.79 19.62 8.89} \ {527.46 63.23 8.75} \ {490.59 10.72 8.72} \ {389.31 14.39 8.73} \ {218.9 8.65 8.71} \ {315.65 7.29 8.84}} set results [eval mvlinreg::ols $data] puts "R-squared: [lindex $results 0]" puts "Adj R-squared: [lindex $results 1]" puts "Coefficients \u00B1 s.e. -- \[95% confidence interval\]:" foreach val [lindex $results 2] se [lindex $results 3] bounds [lindex $results 4] { set lb [lindex $bounds 0] set ub [lindex $bounds 1] puts " $val \u00B1 $se -- \[$lb to $ub\]" } if 0 { This should give the following output: R-squared: 0.371918955025 Adj R-squared: 0.325394433175 Coefficients ± s.e. -- [95% confidence interval]: -0.432215837601 ± 0.744774375289 -- [-1.96036662835 to 1.09593495315] -250.868151309 ± 92.4723901677 -- [-440.605823342 to -61.1304792767] 2615.78338226 ± 807.559562552 -- [958.808028331 to 4272.75873618] } ====== ---- [AM] (19 february 2007) You are welcome to contribute this to Tcllib :) Just say the word, and I will incorporate it in the package. [EKB] I'd be delighted to have it in Tcllib! What should I do to make that happen? [LV] One place developers can start is visiting http://tcllib.sf.net/ and posting a new Feature Request, specifying that you are the creator of the code in question, specifying the license you use (tcllib uses BSD based licensing, I believe), and mention that we'd asked you to contribute it. Man pages, test cases, demos, etc. are all wonderful things to have available, if possible. But if they are not available and you don't have time/interest in generating them, then perhaps someone who is a fan of the module could write them. [EKB] Thanks. Sounds good. I'll work on creating demos and a man page. [EKB] Done! I've submitted it, with a plain text man page (no *roff) to tcllib at sf.net. I hope I've included what's needed! <> Mathematics | Statistics