Code in this package first appeared on the O'Reilly Network on July 23, 2004.
by Paul Meagher <paul@datavore.com>
Last modified March 31 2006 @ 1:23 pm UTC
Demos:
Source code listing:
<?php
/**
* @package SFA
*
* @author Paul Meagher, Datavore Productions
* @license PHP v3.0
* @version 0.7
*
* Class implementing Single Factor ANOVA steps.
*/
require_once PHPMATH ."/PDL/FDistribution.php";
class SingleFactorANOVA {
/*
* Database table holding experiment data.
*/
var $table = null;
/**
* Database column storing treatment levels.
*/
var $treatment = null;
/**
* Indexed array of treatment levels.
*/
var $levels = array();
/**
* Stores the number of treatment levels.
*/
var $num_levels = null;
/**
* Database column storing the response measurement.
*/
var $response = null;
/**
* Associative array of treatment sums and grand sum.
*/
var $sums = array();
/**
* Associative array of treatment sample sizes and grand n.
*/
var $n = array();
/**
* Associative array of treatment sample means and grand mean.
*/
var $means = array();
/**
* Associative array of treatment sample variances.
*/
var $variance = array();
/**
* Associative array of treatment effects.
*/
var $effects = array();
/**
* Associative array of treatment, between, and within sums of squares.
*/
var $ss = array();
/**
* Associative array of between and within df values.
*/
var $df = array();
/**
* Associative array of between and within mean square estimates.
*/
var $ms = array();
/**
* Stores the observed F score.
*/
var $f = null;
/**
* Stores the probability of the F score.
*/
var $p = null;
/**
* Alpha setting with traditional .05 default.
*/
var $alpha = .05;
/**
* Stores the critical F score.
*/
var $crit = null;
/**
* Indexed array of means sorted lowest to highest for comparison.
*/
var $diff_means = null;
/**
* Indexed array of treatment labels (shared same index as diff_means).
*/
var $diff_labels = null;
/**
* Matrix of mean differences.
*/
var $diffs = null;
/**
* Constructor that allows you to optionally set the data table, the
* treatment colum and the response column via the constructor.
*/
function SingleFactorANOVA($table=false, $treatment=false, $response=false, $alpha=.05) {
if ( ($table != false) AND ($table != false) AND ($table != false) ) {
$this->table = $table;
$this->treatment = $treatment;
$this->response = $response;
$this->alpha = $alpha;
}
}
/**
* Set database table to use.
*/
function setTable($table) {
$this->table = $table;
}
/**
* Set treatment column.
*/
function setTreatment($column) {
$this->treatment = $column;
}
/**
* Set response column.
*/
function setResponse($column) {
$this->response = $column;
}
/**
* Set alpha level.
*/
function setAlpha($alpha) {
$this->alpha = $alpha;
}
/**
* Convenience function for inserting simulated values into data table.
* Prior to using this function, you must first have created the data table
* then instantiated $this->table, $this->treatment and $this->response to
* the relevant values. See simulation.php example for sample usage.
*/
function insert($level, $responses, $type=float) {
global $db;
foreach($responses AS $response) {
if ($type == "float") {
$y = (float) $response;
}
if ($type == "int") {
$y = (int) $response;
}
$sql = " INSERT INTO $this->table ($this->treatment, $this->response) ";
$sql .= " VALUES ('$level', $y ) ";
$db->query($sql);
}
}
/**
* Purge the data table of existing values - useful for
* simulation work.
*/
function emptyTable() {
global $db;
$sql = " DELETE FROM $this->table ";
$result = $db->query($sql);
if (DB::isError($result)) {
die($result->getMessage());
} else {
return true;
}
}
/**
* Compute single factor ANOVA statistics.
*/
function analyze() {
global $db;
$sql = " SELECT $this->treatment, sum($this->response), ";
$sql .= " sum($this->response * $this->response), count($this->response) ";
$sql .= " FROM $this->table ";
$sql .= " GROUP BY $this->treatment ";
$result = $db->query($sql);
if (DB::isError($result)) {
die($result->getMessage());
} else {
while ($row = $result->fetchRow()) {
$level = $row[0];
$this->levels[] = $row[0];
$this->sums[$level] = $row[1];
$this->n[$level] = $row[3];
$this->means[$level] = $this->sums[$level] / $this->n[$level];
$this->ss[$level] = $row[2] - $this->n[$level] * pow($this->means[$level], 2);
$this->variance[$level] = $this->ss[$level] / ($this->n[$level] - 1);
}
$this->sums["total"] = array_sum($this->sums);
$this->n["total"] = array_sum($this->n);
$this->means["grand"] = $this->sums["total"] / $this->n["total"];
$this->ss["within"] = array_sum($this->ss);
foreach($this->levels as $level) {
$this->effects[$level] = $this->means[$level] - $this->means["grand"];
$this->ss["between"] += $this->n[$level] * pow($this->effects[$level], 2);
}
$this->num_levels = count($this->levels);
$this->df["between"] = $this->num_levels - 1;
$this->df["within"] = $this->n["total"] - $this->num_levels;
$this->ms["between"] = $this->ss["between"] / $this->df["between"];
$this->ms["within"] = $this->ss["within"] / $this->df["within"];
$this->f = $this->ms["between"] / $this->ms["within"];
$F = new FDistribution($this->df["between"], $this->df["within"]);
$this->p = 1 - $F->CDF($this->f);
$this->crit = $F->inverseCDF(1 - $this->alpha);
}
}
/**
* Creates an associate array containing the five number summary for each
* treatment level: minimum value, first quartile value, median value,
* third quartile value, and max value. These numbers are used to specify
* box plot components for each treatment level.
*/
function getFiveNumberSummary() {
global $db;
$sql = " SELECT $this->treatment, $this->response ";
$sql .= " FROM $this->table ";
$sql .= " ORDER BY $this->treatment, $this->response ASC ";
$result = $db->query($sql);
if (DB::isError($result)) {
die($result->getMessage());
} else {
while ($row = $result->fetchRow()) {
$level = $row[0];
$response = $row[1];
$columns[$level][] = $response;
}
foreach($this->levels AS $level) {
$summary[$level]["min"] = $columns[$level][0];
$n = $this->n[$level];
if ($n % 2) {
$half = $n/2;
$q1_lo = floor($half / 2) - 1;
$q1_hi = floor($half / 2);
$summary[$level]["q1"] = ( $columns[$level][$q1_lo] + $columns[$level][$q2_hi] ) / 2;
$q2 = ceil($half);
$summary[$level]["median"] = $columns[$level][$q2];
$q3_lo = $half + $q1 - 1;
$q3_hi = $half + $q1;
$summary[$level]["q3"] = ( $columns[$level][$q3_lo] + $columns[$level][$q3_hi] ) / 2;
} else {
$half = $n/2;
$q1 = floor(($half) / 2);
$summary[$level]["q1"] = $columns[$level][$q1];
$q2_lo = $half - 1;
$q2_hi = $half;
$summary[$level]["median"] = ( $columns[$level][$q2_lo] + $columns[$level][$q2_hi] ) / 2;
$q3 = $half + $q1;
$summary[$level]["q3"] = $columns[$level][$q3];
}
$summary[$level]["max"] = $columns[$level][$n-1];
}
return $summary;
}
}
/*
* Compute differences between treatment means.
*
* @see http://www.uwsp.edu/psych/cw/statmanual/posthocs.html
*/
function getMeanDifferences() {
$means = $this->means;
unset($means["grand"]); // remove grand mean
asort($means); // sort lowest to highest
$this->diff_means = array_values($means);
$this->diff_labels = array_keys($means);
for ($i = $this->num_levels - 1; $i >= 0; $i--) {
for ($j=0; $j < $this->num_levels; $j++) {
if ($i < $j) {
$this->diffs[$j][$i] = abs($this->diff_means[$j] - $this->diff_means[$i]);
}
}
}
}
/**
* Show mean differences in an extensible tabular format.
*/
function showMeanDifferences($format="%.2f") {
$this->getMeanDifferences();
?>
<table>
<tr>
<th>Mean Differences</th>
</tr>
<tr>
<td>
<table border='1' cellpadding='3' cellspacing='0'>
<tr bgcolor='#ffffcc'>
<td> </td>
<?php
for ($i=0; $i < $this->num_levels; $i++) {
$mean = sprintf($format, $this->diff_means[$i]);
print "<td><b>".ucfirst($this->diff_labels[$i])." [$mean]</b></td>";
}
?>
</tr>
<?php
for ($i=0; $i < $this->num_levels; $i++) {
print "<tr><td align='right' bgcolor='#ffffcc'><b>".ucfirst($this->diff_labels[$i])."</b></td>";
for ($j=0; $j < $this->num_levels; $j++) {
print "<td align='right'>";
if(isset($this->diffs[$j][$i])) {
print $this->diffs[$j][$i];
} else {
print " ";
}
echo "</td>";
}
echo "</tr>";
}
?>
</table>
</td>
</tr>
</table>
<?php
}
/*
* Output ANOVA source table.
*/
function showSourceTable($format="%.2f") {
?>
<table>
<tr>
<th>ANOVA Source Table</th>
</tr>
<tr>
<td align='center'>
<table border='1' cellpadding='3' cellspacing='0'>
<tr bgcolor='ffffcc'>
<td><b>Source</b></td>
<td align='center'><b>SS</b></td>
<td align='center'><b>df</b></td>
<td align='center'><b>MS</b></td>
<td align='center'><b>F</b></td>
<td align='center'><b>p</b></td>
</tr>
<tr>
<td>Between</td>
<td align='right'><?php printf($format, $this->ss["between"]); ?></td>
<td align='center'><?php print $this->df["between"]; ?></td>
<td align='right'><?php printf($format, $this->ms["between"]); ?></td>
<td><?php printf($format, $this->f); ?></td>
<td><?php printf($format, $this->p); ?></td>
</tr>
<tr>
<td>Within</td>
<td align='right'><?php printf($format, $this->ss["within"]); ?></td>
<td align='center'><?php print $this->df["within"]; ?></td>
<td align='right'><?php printf($format, $this->ms["within"]); ?></td>
<td> </td>
<td> </td>
</tr>
<tr bgcolor='ffffcc'>
<td colspan='6'>
<small>
<?php
$crit = sprintf($format, $this->crit);
print "Critical F($this->alpha, ". $this->df["between"] .", ". $this->df["within"] .") is $crit. ";
?>
</small>
</td>
</tr>
</table>
</td>
</tr>
</table>
<?php
}
/*
* Output basic descriptive statistics (means, stdev, n).
*/
function showDescriptiveStatistics($format="%.2f") {
$treatment = ucfirst($this->treatment);
?>
<table>
<tr>
<th><?php echo "$treatment Levels"; ?></th>
</tr>
<tr>
<td>
<table border='1' cellspacing='0' cellpadding='3'>
<tr bgcolor='ffffcc'>
<th> </th>
<?php
foreach ($this->levels as $level) {
print "<th>$level</th>";
}
?>
</tr>
<tr>
<th>mean</th>
<?php
foreach ($this->levels as $level) {
$mean = sprintf($format, $this->means[$level]);
print "<td align='right'>$mean</td>";
}
?>
</tr>
<tr>
<th>stdev</th>
<?php
foreach ($this->levels as $level) {
$stdev = sprintf($format, sqrt($this->ss[$level] / $this->n[$level]));
print "<td align='right'>$stdev</td>";
}
?>
</tr>
<tr>
<th>n</th>
<?php
foreach ($this->levels as $level) {
print "<td align='right'>".$this->n[$level]."</td>";
}
?>
</tr>
</table>
</td>
</tr>
</table>
<?php
}
/*
* Output contents of database table.
*/
function showRawData() {
global $db;
$data = $db->tableInfo($this->table, DB_TABLEINFO_ORDER);
$columns = array_keys($data["order"]);
$num_columns = count($columns);
?>
<table cellspacing='0' cellpadding='0'>
<tr>
<td>
<table border='1' cellspacing='0' cellpadding='3'>
<?php
print "<tr bgcolor='ffffcc'>";
for ($i=0; $i < $num_columns; $i++) {
print "<td align='center'><b>".$columns[$i]."</b></td>";
}
print "</tr>";
$fields = implode(",", $columns);
$sql = " SELECT $fields FROM $this->table ";
$result = $db->query($sql);
if (DB::isError($result)) {
die($result->getMessage());
} else {
while ($row = $result->fetchRow()) {
print "<tr>";
foreach($row as $key=>$value) {
print "<td>$value</td>";
}
print "</tr>";
}
}
?>
</table>
</td>
</tr>
</table>
<?php
}
/**
* The showBoxPlot method is a wrapper for JPGRAPH methods.
* The JPGRAPH constant defining to the root of the JPGraph
* library is specified in the config.php file.
*
* I only do very basic $params array handling in this method. There
* is a lot of room for improvement in making parameter handling more
* extensive (i.e., so you have more control over aspects of your plot)
* and more intelligent.
*/
function showBoxPlot($params=array()) {
include_once JPGRAPH . "/src/jpgraph.php";
include_once JPGRAPH . "/src/jpgraph_stock.php";
$summary = $this->getFiveNumberSummary();
$yData = array();
foreach($this->levels AS $level) {
// Data must be in the format : q1, q3, min, max, median.
$plot_parts = array("q1","q3","min","max","median");
foreach($plot_parts AS $part) {
$yData[] = $summary[$level][$part];
}
}
$figureTitle = $params["figureTitle"];
$xTitle = $params["xTitle"];
$yTitle = $params["yTitle"];
if(!isset($figureTitle))
$figureTitle = "Figure 1";
if(!isset($xTitle))
$xTitle = "x-level";
if(!isset($yTitle))
$yTitle = "y-level";
$plotWidth = 400;
$plotHeight = 300;
$yMin = $params["yMin"];
$yMax = $params["yMax"];
$yTicks = $params["yTicks"];
$yMargin = 35;
$xMargin = 15;
$xLabels = $this->levels;
$graph = new Graph($plotWidth, $plotHeight);
$graph->SetFrame(false);
$graph->SetMarginColor('white');
$graph->title->Set($figureTitle);
$graph->SetScale("textlin", $yMin, $yMax);
$graph->yscale->ticks->Set($yTicks);
$graph->xaxis->SetTickLabels($xLabels);
$graph->xaxis->SetTitle($xTitle,"center");
$graph->xaxis->SetTitleMargin($xMargin);
$graph->yaxis->SetTitle($yTitle, "center");
$graph->yaxis->SetTitleMargin($yMargin);
// Create a new box plot.
$bp = new BoxPlot($yData);
// Indent bars so they dont start and end at the edge of the plot area.
$bp->SetCenter();
// Width of the bars in pixels.
$bp->SetWidth(25);
// Set bar colors
$bp->SetColor('black','white','black','white');
// Set median colors
$bp->SetMedianColor('black','black');
$graph->Add($bp);
if (isset($params["outputFile"])) {
$outputFile = $params["outputFile"];
$graph_name = "temp/$outputfile";
} else {
$graph_name = "temp/boxplot.png";
}
$graph->Stroke($graph_name);
echo "<img src='$graph_name' vspace='15' alt='$figureTitle'>";
}
}
?>