2012-06-20 21:39:23 -07:00

508 lines
12 KiB
PHP
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/* WikiParser
* Version 1.0
* Copyright 2005, Steve Blinch
* http://code.blitzaffe.com
*
* This class parses and returns the HTML representation of a document containing
* basic MediaWiki-style wiki markup.
*
*
* USAGE
*
* Refer to class_WikiRetriever.php (which uses this script to parse fetched
* wiki documents) for an example.
*
*
* LICENSE
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
*
*/
/**
 * Converts basic MediaWiki-style markup into HTML (parse()/output()) and
 * converts a small subset of HTML back into wiki markup (strip()).
 *
 * The parser is line oriented: every input line is first matched against the
 * block-level ("line") patterns and then against the inline ("char")
 * patterns. The handle_* methods are the callbacks for those patterns and
 * are public because they are invoked by name.
 *
 * NOTE(review): text and link targets are emitted into the HTML without
 * escaping; callers that parse untrusted markup must sanitise it themselves.
 */
class Cana_Wiki {
    /** @var string Base URL prepended to internal link targets; when empty, internal links render as plain text. */
    public $reference_wiki = '';
    /** @var string Base URL prepended to image targets; when empty, only the image title is rendered. */
    public $image_uri = '';
    /** @var bool When true, [[Image:...]] links produce no output at all. */
    public $ignore_images = true;
    /** @var string Title passed to parse(); substituted for {{PAGENAME}}. */
    public $page_title = '';
    /** @var string|false Target of a leading #REDIRECT directive, or false when there is none. */
    public $redirect = false;
    /** @var bool Set by a line handler to stop further pattern matching on the current line. */
    public $stop = false;
    /** @var bool Set by a line handler to skip inline processing of the current line as well. */
    public $stop_all = false;
    /** @var bool True right after a line break or heading so that runs of blank lines collapse. */
    public $suppress_linebreaks = false;
    /** @var int Current list nesting depth. */
    public $list_level = 0;
    /** @var array Stack of the list tags ('ul'/'ol') that are currently open. */
    public $list_level_types = [];
    /** @var bool True while a <dl> is open. */
    public $deflist = false;
    /** @var bool True while a <pre> is open. */
    public $preformat = false;
    /** @var array Map of apostrophe count => whether that emphasis is currently open. */
    public $emphasis = [];
    /** @var int Counter used to label untitled external links ([1], [2], ...). */
    public $linknumber = 0;
    /** @var array Bodies of <nowiki> blocks, in document order. */
    public $nowikis = [];
    /** @var array Bodies of <code> blocks, in document order. */
    public $code = [];

    /**
     * Sets the configuration defaults.
     */
    public function __construct() {
        $this->reference_wiki = '';
        $this->image_uri = '';
        $this->ignore_images = true;
    }

    /**
     * Legacy PHP 4-style constructor name, kept so existing explicit calls
     * (for example parent::Cana_Wiki()) keep working. PHP 8 no longer treats
     * it as a constructor, which is why __construct() exists.
     */
    public function Cana_Wiki() {
        $this->__construct();
    }

    /**
     * Returns a capture group, or '' when the group did not participate in
     * the match (PCRE callbacks omit trailing unmatched groups).
     */
    private function match_group(array $matches, $index) {
        return isset($matches[$index]) ? $matches[$index] : '';
    }

    /**
     * Line handler for "== Heading ==". The heading level is the number of
     * leading '=' characters.
     */
    public function handle_sections($matches) {
        $level = strlen($matches[1]);
        $content = $matches[2];
        $this->stop = true;
        // avoid accidental run-on emphasis
        return $this->emphasize_off() . "\n\n<h{$level}>{$content}</h{$level}>\n\n";
    }

    /**
     * Line handler for an empty line: emits a paragraph break unless one was
     * just emitted.
     */
    public function handle_newline($matches) {
        if ($this->suppress_linebreaks) {
            return $this->emphasize_off();
        }
        $this->stop = true;
        // avoid accidental run-on emphasis
        return $this->emphasize_off() . "<br /><br />";
    }

    /**
     * Line handler for "*" / "#" list items.
     *
     * Opens or closes list tags until the nesting depth equals the number of
     * marker characters, then emits the item.
     *
     * @param array|false $matches regex matches; ignored when $close is true
     * @param bool        $close   when true, only close every open list
     * @return string
     */
    public function handle_list($matches, $close = false) {
        $listtypes = [
            '*' => 'ul',
            '#' => 'ol',
        ];
        // When closing there are no matches to read; the target depth is 0.
        $marker = $close ? '' : $matches[1];
        $newlevel = strlen($marker);
        $output = '';
        while ($this->list_level != $newlevel) {
            if ($this->list_level > $newlevel) {
                $tag = '/' . array_pop($this->list_level_types);
                $this->list_level--;
            } else {
                // the last marker character decides the type of newly opened lists
                $tag = $listtypes[substr($marker, -1)];
                $this->list_level++;
                array_push($this->list_level_types, $tag);
            }
            $output .= "\n<{$tag}>\n";
        }
        if ($close) {
            return $output;
        }
        return $output . "<li>" . $matches[2] . "</li>\n";
    }

    /**
     * Line handler for definition lists: "; term", "; term : definition" and
     * ": definition".
     *
     * @param array|false $matches regex matches; ignored when $close is true
     * @param bool        $close   when true, only close the open <dl>
     * @return string
     */
    public function handle_definitionlist($matches, $close = false) {
        if ($close) {
            $this->deflist = false;
            return "</dl>\n";
        }
        $output = $this->deflist ? '' : "<dl>\n";
        $this->deflist = true;
        switch ($matches[1]) {
            case ';':
                $term = $matches[2];
                $p = strpos($term, ' :');
                if ($p !== false) {
                    // Split at the " :" separator itself so that colons inside
                    // the term or the definition are preserved.
                    $definition = ltrim(substr($term, $p + 2));
                    $term = rtrim(substr($term, 0, $p));
                    $output .= "<dt>{$term}</dt><dd>{$definition}</dd>";
                } else {
                    $output .= "<dt>{$term}</dt>";
                }
                break;
            case ':':
                $definition = $matches[2];
                $output .= "<dd>{$definition}</dd>\n";
                break;
        }
        return $output;
    }

    /**
     * Line handler for lines starting with whitespace: they are emitted
     * verbatim inside a <pre> block with inline markup disabled.
     *
     * @param array|false $matches regex matches; ignored when $close is true
     * @param bool        $close   when true, only close the open <pre>
     * @return string
     */
    public function handle_preformat($matches, $close = false) {
        if ($close) {
            $this->preformat = false;
            return "</pre>\n";
        }
        $this->stop_all = true;
        $output = $this->preformat ? '' : "<pre>";
        $this->preformat = true;
        return $output . $matches[1] . "\n";
    }

    /**
     * Line handler for "----".
     */
    public function handle_horizontalrule($matches) {
        return "<hr />";
    }

    /**
     * Turns a topic name into its URL form: spaces become underscores and the
     * first character is upper-cased.
     */
    public function wiki_link($topic) {
        return ucfirst(str_replace(' ', '_', $topic));
    }

    /**
     * Renders an [[Image:...]] link.
     *
     * @param string $href    image name, appended to $this->image_uri
     * @param string $title   caption / alt text
     * @param array  $options remaining pipe-separated options ('frame', 'right')
     * @return string
     */
    public function handle_image($href, $title, $options) {
        if ($this->ignore_images) {
            return "";
        }
        if (!$this->image_uri) {
            return $title;
        }
        $imagetag = sprintf('<img src="%s" alt="%s" />', $this->image_uri . $href, $title);
        foreach ($options as $option) {
            switch ($option) {
                case 'frame':
                    $imagetag = sprintf(
                        '<div style="float: right; background-color: #F5F5F5; border: 1px solid #D0D0D0; padding: 2px">'.
                        '%s'.
                        '<div>%s</div>'.
                        '</div>',
                        $imagetag,
                        $title
                    );
                    break;
                case 'right':
                    $imagetag = sprintf('<div style="float: right">%s</div>', $imagetag);
                    break;
            }
        }
        return $imagetag;
    }

    /**
     * Inline handler for [[namespace:target|title]]suffix.
     *
     * Image links are delegated to handle_image(). Other links become anchors
     * below $this->reference_wiki, or plain text when no reference wiki is
     * configured.
     */
    public function handle_internallink($matches) {
        $namespace = $this->match_group($matches, 3);
        $href = $this->match_group($matches, 4);
        $explicit = $this->match_group($matches, 6);
        $title = ($explicit !== '') ? $explicit : $href . $this->match_group($matches, 7);

        if ($namespace === 'Image') {
            $options = explode('|', $title);
            $title = array_pop($options);
            return $this->handle_image($href, $title, $options);
        }

        // drop parenthesised disambiguation and any leading "prefix:" from the label
        $title = preg_replace('/\(.*?\)/', '', $title);
        $title = preg_replace('/^.*?\:/', '', $title);

        if (!$this->reference_wiki) {
            return $title;
        }
        $href = $this->reference_wiki . ($namespace ? $namespace . ':' : '') . $this->wiki_link($href);
        // internal links stay in the current window, so no target attribute
        return sprintf('<a href="%s">%s</a>', $href, $title);
    }

    /**
     * Inline handler for [url title]. Untitled links are labelled with a
     * running number.
     */
    public function handle_externallink($matches) {
        $href = $matches[2];
        // the title group captures its leading whitespace; it is not part of the label
        $title = trim($this->match_group($matches, 3));
        if ($title === '') {
            $this->linknumber++;
            $title = "[{$this->linknumber}]";
        }
        return sprintf('<a href="%s" target="_blank">%s</a>', $href, $title);
    }

    /**
     * Toggles the emphasis that corresponds to a run of $amount apostrophes
     * and returns the opening or closing tag.
     *
     * @param int $amount number of apostrophes (2 to 5)
     * @return string
     */
    public function emphasize($amount) {
        $amounts = [
            2 => ['<em>', '</em>'],
            3 => ['<strong>', '</strong>'],
            4 => ['<strong>', '</strong>'],
            5 => ['<em><strong>', '</strong></em>'],
        ];
        $output = "";
        $open = !empty($this->emphasis[$amount]);
        // handle cases where emphasized phrases end in an apostrophe, eg: ''somethin'''
        // should read <em>somethin'</em> rather than <em>somethin<strong>
        if (!$open && !empty($this->emphasis[$amount - 1])) {
            $amount--;
            $output = "'";
            $open = true;
        }
        $output .= $amounts[$amount][(int) $open];
        $this->emphasis[$amount] = !$open;
        return $output;
    }

    /**
     * Inline handler for runs of two to five apostrophes.
     */
    public function handle_emphasize($matches) {
        return $this->emphasize(strlen($matches[1]));
    }

    /**
     * Closes every emphasis that is still open and returns the closing tags.
     */
    public function emphasize_off() {
        $output = "";
        foreach (array_keys($this->emphasis) as $amount) {
            if (!empty($this->emphasis[$amount])) {
                $output .= $this->emphasize($amount);
            }
        }
        return $output;
    }

    /**
     * Inline handler for magic words that produce no output.
     */
    public function handle_eliminate($matches) {
        return "";
    }

    /**
     * Inline handler for {{VARIABLE}}; unknown names are replaced by ''.
     */
    public function handle_variable($matches) {
        switch ($matches[2]) {
            case 'CURRENTMONTH': return date('m');
            case 'CURRENTMONTHNAMEGEN':
            case 'CURRENTMONTHNAME': return date('F');
            case 'CURRENTDAY': return date('d');
            case 'CURRENTDAYNAME': return date('l');
            case 'CURRENTYEAR': return date('Y');
            case 'CURRENTTIME': return date('H:i');
            case 'NUMBEROFARTICLES': return '0';
            case 'PAGENAME': return $this->page_title;
            case 'NAMESPACE': return 'None';
            // HTTP_HOST is not set outside a web request (for example on the CLI)
            case 'SITENAME': return isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
            default: return '';
        }
    }

    /**
     * Converts one line of wiki markup to HTML, updating the open
     * list / definition list / preformat state.
     *
     * @param string $line
     * @return string
     */
    public function parse_line($line) {
        $line_regexes = [
            'preformat' => '^\s(.*?)$',
            'definitionlist' => '^([\;\:])\s*(.*?)$',
            'newline' => '^$',
            'list' => '^([\*\#]+)(.*?)$',
            'sections' => '^(={1,6})(.*?)(={1,6})$',
            'horizontalrule' => '^----$',
        ];
        $char_regexes = [
            'internallink' => '('.
                '\[\['. // opening brackets
                '(([^\]]*?)\:)?'. // namespace (if any)
                '([^\]]*?)'. // target
                '(\|([^\]]*?))?'. // title (if any)
                '\]\]'. // closing brackets
                '([a-z]+)?'. // any suffixes
                ')',
            'externallink' => '('.
                '\['.
                '([^\]]*?)'.
                '(\s+[^\]]*?)?'.
                '\]'.
                ')',
            'emphasize' => '(\'{2,5})',
            'eliminate' => '(__TOC__|__NOTOC__|__NOEDITSECTION__)',
            'variable' => '('. '\{\{' . '([^\}]*?)' . '\}\}' . ')',
        ];

        $this->stop = false;
        $this->stop_all = false;
        $called = [];
        $line = rtrim($line);

        // Block-level patterns. Each handler receives the matches and returns
        // the replacement for the whole line; later patterns see that output.
        foreach ($line_regexes as $name => $regex) {
            if (preg_match("/$regex/i", $line, $matches)) {
                $called[$name] = true;
                $handler = "handle_" . $name;
                $line = $this->$handler($matches);
                if ($this->stop || $this->stop_all) {
                    break;
                }
            }
        }

        // Inline patterns, unless the line handler disabled them.
        if (!$this->stop_all) {
            $this->stop = false;
            foreach ($char_regexes as $name => $regex) {
                $line = preg_replace_callback("/$regex/i", [$this, "handle_" . $name], $line);
                if ($this->stop) {
                    break;
                }
            }
        }

        $isline = strlen(trim($line)) > 0;

        // if this wasn't a list item, and we are in a list, close the list tag(s)
        if (($this->list_level > 0) && empty($called['list'])) {
            $line = $this->handle_list(false, true) . $line;
        }
        if ($this->deflist && empty($called['definitionlist'])) {
            $line = $this->handle_definitionlist(false, true) . $line;
        }
        if ($this->preformat && empty($called['preformat'])) {
            $line = $this->handle_preformat(false, true) . $line;
        }

        // suppress linebreaks for the next line if we just displayed one; otherwise re-enable them
        if ($isline) {
            $this->suppress_linebreaks = (!empty($called['newline']) || !empty($called['sections']));
        }
        return $line;
    }

    /**
     * Wraps bare http/https/ftp URLs in anchors.
     *
     * Existing anchors and the insides of tags are matched first and returned
     * unchanged, so URLs that are already an href/src value or a link label
     * are not linked a second time.
     */
    private function autolink($html) {
        $pattern = '/<a\b[^>]*>.*?<\/a>' // an existing anchor, label included
            . '|<[^<>]*>' // any other tag
            . '|((?:http|ftp|https):\/\/[\w.\/;:\-=+&!@#$%^*()_,\'\[\]{}?|]+)/is';
        return preg_replace_callback($pattern, function ($m) {
            if (empty($m[1])) {
                return $m[0];
            }
            return '<a href="' . $m[1] . '" target="_blank">' . $m[1] . '</a>';
        }, $html);
    }

    /**
     * Converts a wiki document to HTML.
     *
     * <nowiki> and <code> sections are set aside before parsing and restored
     * afterwards. When the first line is a #REDIRECT directive its target is
     * stored in $this->redirect.
     *
     * @param string $text  wiki markup
     * @param string $title page title used for {{PAGENAME}}
     * @return string HTML
     */
    public function parse($text, $title = '') {
        // reset all per-document state so the instance can be reused
        $this->nowikis = [];
        $this->code = [];
        $this->list_level_types = [];
        $this->emphasis = [];
        $this->list_level = 0;
        $this->linknumber = 0;
        $this->deflist = false;
        $this->preformat = false;
        $this->suppress_linebreaks = false;
        $this->redirect = false;
        $this->page_title = $title;

        $text = (string) $text;
        $text = preg_replace_callback('/<nowiki>([\s\S]*?)<\/nowiki>/i', [$this, 'handle_save_nowiki'], $text);
        $text = preg_replace_callback('/<code>([\s\S]*?)<\/code>/i', [$this, 'handle_save_code'], $text);

        $lines = explode("\n", $text);
        if (preg_match('/^\#REDIRECT\s+\[\[(.*?)\]\]$/', trim($lines[0]), $matches)) {
            $this->redirect = $matches[1];
        }

        $output = '';
        foreach ($lines as $line) {
            $html = $this->parse_line($line);
            if ($html === '') {
                continue;
            }
            // keep a separator between source lines so their words do not run together
            if ($output !== '' && substr($output, -1) !== "\n") {
                $output .= "\n";
            }
            $output .= $html;
        }

        // close whatever is still open at the end of the document
        $output .= $this->emphasize_off();
        if ($this->list_level > 0) {
            $output .= $this->handle_list(false, true);
        }
        if ($this->deflist) {
            $output .= $this->handle_definitionlist(false, true);
        }
        if ($this->preformat) {
            $output .= $this->handle_preformat(false, true);
        }

        $output = $this->autolink($output);

        $output = preg_replace_callback('/<nowiki><\/nowiki>/i', [$this, 'handle_restore_nowiki'], $output);
        $output = preg_replace_callback('/<code><\/code>/i', [$this, 'handle_restore_code'], $output);
        return $output;
    }

    /**
     * Stores the body of a <nowiki> block and leaves an empty placeholder.
     */
    public function handle_save_nowiki($matches) {
        array_push($this->nowikis, $matches[1]);
        return '<nowiki></nowiki>';
    }

    /**
     * Replaces a <nowiki> placeholder with the next stored body. Bodies are
     * taken from the front so they come back in document order.
     */
    public function handle_restore_nowiki($matches) {
        return array_shift($this->nowikis);
    }

    /**
     * Stores the body of a <code> block and leaves an empty placeholder.
     */
    public function handle_save_code($matches) {
        array_push($this->code, $matches[1]);
        return '<code></code>';
    }

    /**
     * Replaces a <code> placeholder with the next stored body, HTML-escaped
     * and wrapped in a <pre>.
     */
    public function handle_restore_code($matches) {
        return '<pre class="prettyprint linenums">'.htmlspecialchars(array_shift($this->code)).'</pre>';
    }

    /**
     * Convenience wrapper: parses $content with a fresh parser and returns
     * the HTML. Anything printed while parsing is discarded.
     *
     * @param string $content wiki markup
     * @return string HTML
     */
    public static function output($content) {
        $wiki = new Cana_Wiki();
        ob_start();
        $content = $wiki->parse($content);
        ob_end_clean();
        return $content;
    }

    /**
     * Converts a fragment of HTML into wiki markup: lists become "*" / "#"
     * items, <i>/<b> become apostrophe emphasis, typographic punctuation is
     * replaced by ASCII, and tags carrying style or event-handler attributes
     * are removed.
     *
     * NOTE(review): href="javascript:..." is not filtered here.
     *
     * @param string $content
     * @return string
     */
    public static function strip($content) {
        $content = stripslashes($content);
        $content = str_replace('</div>', "</div>\n", $content);
        $content = htmlspecialchars_decode($content);
        $content = strip_tags($content, '<b><li><i><s><u><a><ul><ol><li><code><?php><%asp><?>');

        // the "s" flag lets a list that spans several lines match as a whole
        $lists = [
            '/\<ul\>.*?\<\/ul\>/is',
            '/\<ol\>.*?\<\/ol\>/is',
        ];
        $content = preg_replace_callback($lists, 'Cana_Wiki::stripReplace', $content);

        // pattern => replacement, applied in this order
        $map = [
            // Remove opening tags that carry a style or on* attribute. [^>] keeps
            // the match inside one tag so text between tags is never swallowed.
            '/<(?:b|li|i|s|u|a|code)\b[^>]*?\b(?:style|on[a-z]+)\s*=[^>]*>/i' => '',
            '/“/' => '"', // left side double smart quote
            '/”/' => '"', // right side double smart quote
            '/‘/' => "'", // left side single smart quote
            '/’/' => "'", // right side single smart quote
            '/…/' => "...", // ellipsis
            '/—/' => "-", // em dash
            '/–/' => "-", // en dash
            // NOTE(review): the next two look like mis-decoded quote characters; confirm the intended mapping
            '/ヤ|ユ/' => '¥',
            '/メ|モ/' => '"',
            // NOTE(review): this pattern appeared as an empty alternation, which matches at
            // every position; assumed to target the C1 controls U+0091/U+0092 -- confirm
            '/\xC2\x91|\xC2\x92/' => "'",
            '/“|”/' => '"',
            '/\<i\>(.*?)\<\/i\>/i' => "''\\1''",
            '/\<b\>(.*?)\<\/b\>/i' => "'''\\1'''",
        ];
        return preg_replace(array_keys($map), array_values($map), $content);
    }

    /**
     * preg_replace_callback() helper for strip(): converts one <ul> or <ol>
     * element into wiki list lines.
     *
     * @param array $content regex matches; index 0 is the whole list element
     * @return string
     */
    public static function stripReplace($content) {
        $html = $content[0];
        // The match starts with its own list tag; checking only that position keeps
        // a nested <ol> from mislabelling a <ul>, and stripos matches <OL> too.
        $marker = (stripos($html, '<ol>') === 0) ? '# ' : '* ';
        $html = strip_tags($html, '<li>');
        $html = preg_replace('/\<\/li\>/i', "\n", $html);
        return preg_replace('/\<li\>/i', $marker, $html);
    }
}