下面是我根据网上找的一个字典档, 简易实现的一个分词程序. 
// Timing helper.
// Returns the current Unix timestamp as a float, in seconds, including the
// fractional (microsecond) part.
function getmicrotime() {
    // microtime(true) already yields the float; there is no need to split
    // and re-add the "usec sec" string form.
    return microtime(true);
}
// Record the timestamp at script start (presumably the baseline for the
// elapsed-time report printed at the end of the script).
$time_start = getmicrotime(); 
// Dictionary class.
// Read-only wrapper around a word => frequency database opened through the
// PHP dba extension with the "gdbm" handler.
class ch_dictionary {
    // dba handle returned by dba_popen(); stays unset until load() succeeds
    var $_id;

    // Constructor. When $fname is non-empty the dictionary file is opened
    // immediately; otherwise load() has to be called before find() can
    // return real frequencies.
    function __construct($fname = "") {
        if ($fname != "") {
            $this->load($fname);
        }
    }

    // Legacy PHP 4-style constructor name. PHP 8 no longer treats a method
    // named after the class as a constructor, so the real work lives in
    // __construct(); this alias only keeps explicit calls working.
    function ch_dictionary($fname = "") {
        $this->__construct($fname);
    }

    // Open the dictionary by file name (gdbm data file), read-only, through a
    // persistent dba connection. Prints a message and terminates the script
    // when the file cannot be opened.
    function load($fname) {
        $this->_id = dba_popen($fname, "r", "gdbm");
        if (!$this->_id) {
            echo "failed to open the dictionary.($fname)
\n";
            exit;
        }
    }

    // Look up a word and return its stored frequency exactly as dba_fetch()
    // returns it (a string), or -1 when the word is not in the dictionary or
    // no dictionary has been loaded.
    function find($word) {
        if (!$this->_id) {
            // No open handle: answer "unknown" instead of passing an invalid
            // handle to dba_fetch().
            return -1;
        }
        $freq = dba_fetch($word, $this->_id);
        return ($freq === false) ? -1 : $freq;
    }
}
// Word-segmentation class (backward matching).
// The input string is first cut, front to back, into sentences; every
// sentence is then segmented from its end towards its start, and the result
// is an array of words.
//
// NOTE(review): the byte handling (lead byte >= 0xa1, two bytes per
// character) assumes GB2312/GBK (EUC-CN) encoded input and dictionary keys --
// confirm against the encoding of the dictionary file in use.
class ch_word_split {
    var $_mb_mark_list; // double-byte (full-width) punctuation that ends a sentence
    var $_word_maxlen;  // maximum length of a single word, in double-byte characters
    var $_dic;          // dictionary object exposing find($word); NULL until set_dic()
    var $_ignore_mark;  // true: punctuation is dropped; false: punctuation is emitted

    // Constructor: installs the default punctuation list and limits.
    function __construct() {
        // The marks are written as GB2312 byte escapes so the list does not
        // depend on the encoding this source file happens to be saved in.
        // Every entry is exactly one double-byte character, because it is
        // compared against a single two-byte character in string_split().
        $this->_mb_mark_list = array(
            "\xa3\xac", // fullwidth comma
            "\xa1\xa1", // ideographic (full-width) space
            "\xa1\xa3", // ideographic full stop
            "\xa3\xa1", // fullwidth exclamation mark
            "\xa3\xbf", // fullwidth question mark
            "\xa3\xba", // fullwidth colon
            "\xa1\xad", // horizontal ellipsis (each half of a doubled ellipsis)
            "\xa1\xa2", // ideographic comma
            "\xa1\xb0", // left double quotation mark
            "\xa1\xb1", // right double quotation mark
            "\xa1\xb6", // left double angle bracket
            "\xa1\xb7", // right double angle bracket
            "\xa3\xa8", // fullwidth left parenthesis
            "\xa3\xa9"  // fullwidth right parenthesis
        );
        $this->_word_maxlen = 12; // 12 double-byte characters
        $this->_dic = NULL;
        $this->_ignore_mark = true;
    }

    // Legacy PHP 4-style constructor name. PHP 8 no longer treats a method
    // named after the class as a constructor, so initialisation lives in
    // __construct(); this alias only keeps explicit calls working.
    function ch_word_split() {
        $this->__construct();
    }

    // Select the dictionary: opens $fname through ch_dictionary.
    function set_dic($fname) {
        $this->_dic = new ch_dictionary($fname);
    }

    // Choose whether punctuation is dropped (true) or emitted (false).
    // Non-boolean arguments are ignored.
    function set_ignore_mark($set) {
        if (is_bool($set)) $this->_ignore_mark = $set;
    }

    // Cut a string into sentences and segment every sentence into words.
    //
    // $str  : input text.
    // $func : optional callback. When it is callable it receives each batch
    //         of words as an array and the method returns true; otherwise all
    //         words are collected and returned as one array.
    //
    // Runs of [0-9A-Za-z_-] are emitted as a single token, half-width spaces
    // and tabs are skipped, and punctuation is emitted only when
    // _ignore_mark is false. A dictionary must have been set (set_dic())
    // before any double-byte text is segmented.
    function string_split($str, $func = "") {
        $ret = array();
        $str = (string) $str;

        // NULL means "no callback": words are collected in $ret instead.
        $cb = ($func !== "" && is_callable($func)) ? $func : NULL;

        $len = strlen($str);
        $qtr = ""; // pending run of double-byte characters, i.e. one sentence
        for ($i = 0; $i < $len; $i++) {
            $char = $str[$i];
            if (ord($char) < 0xa1) {
                // A single-byte character ends the pending sentence.
                if ($qtr !== "") {
                    $this->_emit($ret, $cb, $this->_sen_split($qtr));
                    $qtr = "";
                }
                if ($this->_is_alnum($char)) {
                    // Swallow the whole run of word/number characters.
                    while ($i + 1 < $len && $this->_is_alnum($str[$i + 1])) {
                        $i++;
                        $char .= $str[$i];
                    }
                    $this->_emit($ret, $cb, array($char));
                }
                elseif ($char == ' ' || $char == "\t") {
                    // Half-width whitespace is never emitted.
                    continue;
                }
                elseif (!$this->_ignore_mark) {
                    $this->_emit($ret, $cb, array($char));
                }
            }
            else {
                // Lead byte of a double-byte character.
                if ($i + 1 >= $len) {
                    // Truncated input: the trail byte is missing. Drop the
                    // dangling byte instead of reading past the end and
                    // misaligning the whole pending sentence by one byte.
                    break;
                }
                $i++;
                $char .= $str[$i];

                if (in_array($char, $this->_mb_mark_list, true)) {
                    // Full-width punctuation also ends the pending sentence.
                    if ($qtr !== "") {
                        $this->_emit($ret, $cb, $this->_sen_split($qtr));
                        $qtr = "";
                    }
                    if (!$this->_ignore_mark) {
                        $this->_emit($ret, $cb, array($char));
                    }
                }
                else {
                    $qtr .= $char;
                }
            }
        }

        // Flush whatever sentence is still pending at the end of the input.
        if ($qtr !== "") {
            $this->_emit($ret, $cb, $this->_sen_split($qtr));
        }

        return ($cb === NULL) ? $ret : true;
    }

    // Deliver one batch of words: pass it to the callback when there is one,
    // otherwise append it to the result list $ret.
    function _emit(&$ret, $cb, $words) {
        if ($cb !== NULL) {
            call_user_func($cb, $words);
        }
        else {
            foreach ($words as $word) {
                $ret[] = $word;
            }
        }
    }

    // Segment one sentence (double-byte characters only) into words, working
    // backwards from the last character. For every end position all
    // candidates of 2 .. _word_maxlen + 1 characters ending there are looked
    // up, and the one with the highest dictionary frequency wins; when none
    // is known the single character becomes a word of its own.
    function _sen_split($sen) {
        // Count whole characters only; the cast keeps every index an integer
        // even if the byte length is odd.
        $len = (int) (strlen($sen) / 2);
        $ret = array();
        for ($i = $len - 1; $i >= 0; $i--) {
            // Start with the character at position $i on its own.
            $w = substr($sen, $i * 2, 2);
            $wlen = 1; // length of the chosen word, in characters

            // Extend the candidate to the left, one character at a time.
            $lf = 0; // best frequency seen so far
            for ($j = 1; $j <= $this->_word_maxlen; $j++) {
                $o = $i - $j;
                if ($o < 0) break;
                $w2 = substr($sen, $o * 2, ($j + 1) * 2);

                $tmp_f = $this->_dic->find($w2);
                if ($tmp_f > $lf) {
                    $lf = $tmp_f;
                    $wlen = $j + 1;
                    $w = $w2;
                }
            }
            // Skip the characters consumed by the chosen word; the loop's own
            // $i-- accounts for the last one.
            $i = $i - $wlen + 1;
            $ret[] = $w;
        }
        // Words were gathered back to front; restore reading order.
        return array_reverse($ret);
    }

    // Tell whether the first byte of $char is a letter, digit, '_' or '-',
    // i.e. matches [0-9A-Za-z_-].
    function _is_alnum($char) {
        $ord = ord($char);
        return $ord == 45 || $ord == 95      // '-' and '_'
            || ($ord >= 48 && $ord <= 57)    // 0-9
            || ($ord >= 65 && $ord <= 90)    // A-Z
            || ($ord >= 97 && $ord <= 122);  // a-z
    }
}
// Callback invoked with each batch of segmented words.
// Prints every word of $ar followed by a single space.
function call_back($ar) {
    foreach ($ar as $word) {
        echo "{$word} ";
    }
}
// Demo: segment the submitted text, or the contents of sample.txt when
// nothing was submitted.
$wp = new ch_word_split();
$wp->set_dic("dic.db");
if (!empty($_REQUEST['testdat']) && is_string($_REQUEST['testdat'])) {
    // Plain copy: binding $data to the request array by reference gains nothing.
    $data = $_REQUEST['testdat'];
}
else {
    $data = file_get_contents("sample.txt");
    if ($data === false) {
        // Sample file missing or unreadable: continue with empty input.
        $data = "";
    }
}
// output
echo "
简易分词演示
\n";
echo "
\n";
echo "分词结果(" . strlen($data) . " chars): 
\n";
// Run the segmentation, streaming every batch of words through call_back().
// NOTE(review): the words are echoed unescaped; if this output is served as
// HTML, request-supplied text should be escaped first -- confirm the output
// context.
$wp->string_split($data, "call_back");
// Elapsed wall-clock time since $time_start was recorded at the top of the script.
$time = getmicrotime() - $time_start;
echo "
\n本次分词耗时: $time seconds 
\n";
?>