<?php
/**
 * Download a page and return the section delimited by two marker strings.
 *
 * The returned string starts at the first occurrence of $start and runs
 * through the first occurrence of $end that follows it; both markers are
 * included in the result.
 *
 * @param string $url   Location handed to file_get_contents().
 * @param string $start Marker that opens the wanted section.
 * @param string $end   Marker that closes the wanted section.
 * @return string The section including both markers; everything from $start
 *                to the end of the page when $end is missing; an empty
 *                string when the page cannot be read or $start is missing.
 */
function fetch($url, $start, $end)
{
    $page = file_get_contents($url);
    if ($page === false) {
        // The download failed; there is nothing to extract.
        return '';
    }

    $startPos = strpos($page, $start);
    if ($startPos === false) {
        return '';
    }

    // Look for the closing marker only after the opening one, so an earlier
    // stray occurrence of $end cannot produce a negative length.
    $endPos = strpos($page, $end, $startPos + strlen($start));
    if ($endPos === false) {
        return substr($page, $startPos);
    }

    // Cut by position instead of str_replace(): replacing the prefix text
    // would also delete identical text inside the wanted section.
    return substr($page, $startPos, $endPos - $startPos + strlen($end));
}
// Read the request parameters defensively: a missing or non-string value
// falls back to a default instead of raising an undefined-index notice.
$query = (isset($_GET['query']) && is_string($_GET['query'])) ? $_GET['query'] : '';

// The language code becomes part of the host name, so only accept plain
// letter groups separated by hyphens (e.g. "en", "zh-min-nan"); anything
// else falls back to the Italian default.
$lang = 'it';
if (isset($_GET['lang']) && is_string($_GET['lang'])) {
    $candidate = strtolower($_GET['lang']);
    if (preg_match('/^[a-z]+(?:-[a-z]+)*$/D', $candidate)) {
        $lang = $candidate;
    }
}

// Percent-encode the title so spaces, "&" and similar characters cannot
// break or extend the API query string.
$xml = fetch(
    "http://" . $lang . ".wikipedia.org/w/api.php?action=query&prop=revisions&titles="
        . rawurlencode($query) . "&rvprop=content&format=xml",
    "<rev>",
    "</rev>"
);
/* THIS FUNCTION WAS IN PARSER.PHP */
/**
 * Convert wikitext headings ("== Title ==" style lines) into HTML <h1>..<h6>.
 *
 * A heading is a line that starts with N "=" characters and ends with N "="
 * characters, optionally followed by whitespace; N selects the heading level.
 *
 * @param string $text Wikitext, possibly spanning several lines.
 * @return string The text with heading lines replaced by <hN>...</hN>.
 */
function doHeadings($text)
{
    // Process the deepest level first: once a "======" line has been turned
    // into <h6>, it no longer starts with "=" and cannot be matched again by
    // the shorter patterns.
    for ($i = 6; $i >= 1; --$i) {
        $h = str_repeat('=', $i);
        // The pattern has a single capture group, so the replacement only
        // refers to \1.
        $text = preg_replace("/^{$h}(.+){$h}\\s*$/m", "<h{$i}>\\1</h{$i}>", $text);
    }
    return $text;
}
/* THIS FUNCTION WAS IN PARSER.PHP */
/**
 * Apply doQuotes() to every line of a text.
 *
 * The text is split on "\n", each line is converted on its own, and the
 * converted lines are joined back together with "\n".
 *
 * @param string $text Wikitext, possibly spanning several lines.
 * @return string The text with each line passed through doQuotes().
 */
function doAllQuotes($text)
{
    $converted = array();
    foreach (explode("\n", $text) as $row) {
        $converted[] = doQuotes($row);
    }
    return implode("\n", $converted);
}
/* THIS FUNCTION WAS IN PARSER.PHP */
/**
 * Convert the apostrophe mark-up of a single line into <i>/<b> tags.
 *
 * Two apostrophes toggle italics, three toggle bold and five toggle both.
 * Runs of four or of more than five apostrophes keep the surplus as literal
 * text. Tags still open at the end of the line are closed.
 *
 * @param string $text One line of wikitext.
 * @return string The line with apostrophe mark-up replaced by HTML tags;
 *                the input unchanged when it has no run of two or more
 *                apostrophes.
 */
function doQuotes( $text ) {
    // Even indexes of $arr hold plain text, odd indexes hold the apostrophe
    // runs (kept because of PREG_SPLIT_DELIM_CAPTURE).
    $arr = preg_split( "/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
    $count = count( $arr );
    if ( $count == 1 ) {
        return $text;
    }

    # First, do some preliminary work. This may shift some apostrophes from
    # being mark-up to being text. It also counts the number of occurrences
    # of bold and italics mark-ups.
    $numbold = 0;
    $numitalics = 0;
    for ( $i = 1; $i < $count; $i += 2 ) {
        $len = strlen( $arr[$i] );
        if ( $len == 4 ) {
            # If there are ever four apostrophes, assume the first is supposed to
            # be text, and the remaining three constitute mark-up for bold text.
            $arr[$i - 1] .= "'";
            $arr[$i] = "'''";
        } elseif ( $len > 5 ) {
            # If there are more than 5 apostrophes in a row, assume they're all
            # text except for the last 5.
            $arr[$i - 1] .= str_repeat( "'", $len - 5 );
            $arr[$i] = "'''''";
        }

        # Count the mark-ups using the (possibly shortened) run. A run of
        # five counts once as italics and once as bold.
        $len = strlen( $arr[$i] );
        if ( $len == 2 ) {
            $numitalics++;
        } elseif ( $len == 3 ) {
            $numbold++;
        } elseif ( $len == 5 ) {
            $numitalics++;
            $numbold++;
        }
    }

    # If there is an odd number of both bold and italics, it is likely
    # that one of the bold ones was meant to be an apostrophe followed
    # by italics. Which one we cannot know for certain, but it is more
    # likely to be one that has a single-letter word before it.
    if ( ( $numbold % 2 == 1 ) && ( $numitalics % 2 == 1 ) ) {
        $firstsingleletterword = -1;
        $firstmultiletterword = -1;
        $firstspace = -1;
        for ( $i = 1; $i < $count; $i += 2 ) {
            if ( strlen( $arr[$i] ) != 3 ) {
                continue;
            }
            # Classify the bold mark-up by the last two characters of the
            # text in front of it.
            $x1 = substr( $arr[$i - 1], -1 );
            $x2 = substr( $arr[$i - 1], -2, 1 );
            if ( $x1 == ' ' ) {
                if ( $firstspace == -1 ) {
                    $firstspace = $i;
                }
            } elseif ( $x2 == ' ' ) {
                if ( $firstsingleletterword == -1 ) {
                    $firstsingleletterword = $i;
                }
            } else {
                if ( $firstmultiletterword == -1 ) {
                    $firstmultiletterword = $i;
                }
            }
        }

        # Prefer a single-letter word, then a multi-letter word, then a
        # mark-up preceded by a space. All three can be -1, for example when
        # the line only contains one run of five apostrophes.
        $target = -1;
        if ( $firstsingleletterword > -1 ) {
            $target = $firstsingleletterword;
        } elseif ( $firstmultiletterword > -1 ) {
            $target = $firstmultiletterword;
        } elseif ( $firstspace > -1 ) {
            $target = $firstspace;
        }
        if ( $target > -1 ) {
            # Turn the chosen bold into "apostrophe + italics".
            $arr[$target] = "''";
            $arr[$target - 1] .= "'";
        }
    }

    # Now let's actually convert our apostrophic mush to HTML!
    # $state names the open tags in the order they were opened: '', 'i', 'b',
    # 'bi' (bold then italics), 'ib' (italics then bold). 'both' means five
    # apostrophes were seen with nothing open; the following text is held in
    # $buffer until the next mark-up decides which tag must be the outer one.
    $output = '';
    $buffer = '';
    $state = '';
    foreach ( $arr as $i => $r ) {
        if ( ( $i % 2 ) == 0 ) {
            if ( $state == 'both' ) {
                $buffer .= $r;
            } else {
                $output .= $r;
            }
            continue;
        }

        $len = strlen( $r );
        if ( $len == 2 ) {
            if ( $state == 'i' ) {
                $output .= '</i>'; $state = '';
            } elseif ( $state == 'bi' ) {
                $output .= '</i>'; $state = 'b';
            } elseif ( $state == 'ib' ) {
                $output .= '</b></i><b>'; $state = 'b';
            } elseif ( $state == 'both' ) {
                $output .= '<b><i>' . $buffer . '</i>'; $state = 'b';
            } else { # $state can be 'b' or ''
                $output .= '<i>'; $state .= 'i';
            }
        } elseif ( $len == 3 ) {
            if ( $state == 'b' ) {
                $output .= '</b>'; $state = '';
            } elseif ( $state == 'bi' ) {
                $output .= '</i></b><i>'; $state = 'i';
            } elseif ( $state == 'ib' ) {
                $output .= '</b>'; $state = 'i';
            } elseif ( $state == 'both' ) {
                $output .= '<i><b>' . $buffer . '</b>'; $state = 'i';
            } else { # $state can be 'i' or ''
                $output .= '<b>'; $state .= 'b';
            }
        } elseif ( $len == 5 ) {
            if ( $state == 'b' ) {
                $output .= '</b><i>'; $state = 'i';
            } elseif ( $state == 'i' ) {
                $output .= '</i><b>'; $state = 'b';
            } elseif ( $state == 'bi' ) {
                $output .= '</i></b>'; $state = '';
            } elseif ( $state == 'ib' ) {
                $output .= '</b></i>'; $state = '';
            } elseif ( $state == 'both' ) {
                $output .= '<i><b>' . $buffer . '</b></i>'; $state = '';
            } else { # ($state == '')
                $buffer = ''; $state = 'both';
            }
        }
    }

    # Now close all remaining tags. Notice that the order is important.
    if ( $state == 'b' || $state == 'ib' ) {
        $output .= '</b>';
    }
    if ( $state == 'i' || $state == 'bi' || $state == 'ib' ) {
        $output .= '</i>';
    }
    if ( $state == 'bi' ) {
        $output .= '</b>';
    }
    # There might be lonely ''''', so make sure we have a buffer. Compare
    # against the empty string explicitly: a plain truthiness test would
    # also discard the buffered text "0".
    if ( $state == 'both' && $buffer !== '' ) {
        $output .= '<b><i>' . $buffer . '</i></b>';
    }
    return $output;
}
// Convert headings first, then the bold/italic apostrophe mark-up.
$xml = doAllQuotes(doHeadings($xml));
// MediaWiki treats the content between [[ and ]] as a link; drop the
// brackets and keep the text between them.
$xml = str_replace(array('[[', ']]'), '', $xml);
echo $xml;
exit;
?>
Refactorings
No refactoring yet !
catbert
August 30, 2008, August 30, 2008 23:09, permalink
Well, you could specify "&format=xml" in the API query string and then unserialize the output; you would simply get this content packed into an array.
I need to add information from Wikipedia to my website. I looked at the MediaWiki documentation for API support and found this: http://www.mediawiki.org/wiki/API. The problem is that the text I receive in the XML output is formatted with MediaWiki markup, and I need to convert it to plain HTML. In the MediaWiki package I found a file, Parser.php, that includes some functions that help me convert part of it.
A demo of the script is available here: http://www.federicopepe.com/test/test2.php?query=Metallica&lang=en
I need to delete or format the content between {{ and }} and between <ref> and </ref>... Maybe with a regex?