E90ba01aea329491b312280d1eea606b

I need to add information from Wikipedia to my website. I looked at the MediaWiki documentation for API support and found this: http://www.mediawiki.org/wiki/API. The problem is that the text I receive in the XML output is formatted as wiki markup, and I need to convert it to plain HTML. In the MediaWiki package I found a file, Parser.php, that includes some functions that help with part of the conversion.

A demo of the script is available here: http://www.federicopepe.com/test/test2.php?query=Metallica&lang=en

I need to delete or format the content between {{ and }} and between <ref> and </ref>... Maybe with a regex?

<?php

/**
 * Download a page and return the section delimited by two markers.
 *
 * The result runs from the first occurrence of $start up to and including the
 * first occurrence of $end that follows it, so both markers are kept in the
 * returned string.
 *
 * @param string $url   Anything file_get_contents() can read.
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @return string The delimited section, or '' when the download fails, a
 *                marker is empty, or a marker is not found.
 */
function fetch($url,$start,$end){
	$page = file_get_contents($url);
	if ($page === false || (string)$start === '' || (string)$end === '') {
		return '';
	}

	$from = strpos($page, $start);
	if ($from === false) {
		return '';
	}

	# Look for the closing marker only after the opening one, so an earlier
	# stray $end cannot produce a negative-length slice.
	$to = strpos($page, $end, $from + strlen($start));
	if ($to === false) {
		return '';
	}

	# Slice by position instead of str_replace()-ing the surrounding text:
	# replacing would also delete every copy of that text inside the section.
	return substr($page, $from, $to + strlen($end) - $from);
}

# Both parameters come straight from the query string, so treat them as
# untrusted: they may be missing, may be arrays, or may contain anything.
$query = (isset($_GET['query']) && is_string($_GET['query'])) ? $_GET['query'] : '';

# $lang becomes part of the host name of the request below. Only accept a value
# shaped like a Wikipedia language code (e.g. "en", "simple", "zh-min-nan");
# anything else falls back to the Italian default so the request can never be
# redirected to an arbitrary host.
$lang = 'it';
if (isset($_GET['lang']) && is_string($_GET['lang']))
{
	$candidate = strtolower($_GET['lang']);
	if (preg_match('/^[a-z]{2,}(-[a-z]+)*$/D', $candidate))
	{
		$lang = $candidate;
	}
}

# The title is percent-encoded so spaces, '&', '#' and similar characters stay
# inside the "titles" parameter instead of breaking the URL.
$xml = fetch("http://".$lang.".wikipedia.org/w/api.php?action=query&prop=revisions&titles=".rawurlencode($query)."&rvprop=content&format=xml","<rev>","</rev>");

/* THIS FUNCTION WAS IN PARSER.PHP */
/**
 * Turn wiki heading lines ("== Title ==") into HTML heading tags.
 *
 * A line that starts with N '=' characters and ends with N '=' characters
 * (optionally followed by whitespace) becomes <hN>...</hN>, for N from 1 to 6.
 * Whitespace between the '=' runs and the title is kept inside the tag.
 *
 * @param string $text Wiki markup, possibly spanning several lines.
 * @return string The text with heading lines replaced by <h1>..<h6> elements.
 */
function doHeadings($text)
{
	# Go from the deepest level down to 1: "=== x ===" also matches the
	# two-'=' pattern, so the longer runs have to be consumed first.
	for ( $i = 6; $i >= 1; --$i )
	{
		$h = str_repeat( '=', $i );
		# The pattern has a single capture group (the title), so the
		# replacement refers to $1 only.
		$converted = preg_replace( '/^' . $h . '(.+)' . $h . '\s*$/m', '<h' . $i . '>$1</h' . $i . '>', $text );
		# preg_replace() returns null on a PCRE error; keep the text as it
		# was in that case rather than discarding it.
		if ( $converted !== null )
		{
			$text = $converted;
		}
	}

	return $text;
}
/* THIS FUNCTION WAS IN PARSER.PHP */
/**
 * Apply doQuotes() to every line of a text.
 *
 * The text is split on "\n", each piece is converted on its own, and the
 * pieces are joined with "\n" again, so the number of lines is unchanged.
 *
 * @param string $text Wiki markup, possibly spanning several lines.
 * @return string The text with each line passed through doQuotes().
 */
function doAllQuotes($text)
{
	$converted = array();
	foreach ( explode( "\n", $text ) as $row ) {
		$converted[] = doQuotes( $row );
	}

	return implode( "\n", $converted );
}
/* THIS FUNCTION WAS IN PARSER.PHP */
/**
 * Convert the apostrophe markup in a piece of wikitext into <i>/<b> tags.
 *
 * Runs of two apostrophes toggle italics, runs of three toggle bold and runs
 * of five toggle both. A run of four is read as one literal apostrophe plus
 * bold; a run longer than five keeps the surplus apostrophes as literal text.
 * Tags still open at the end of the input are closed before returning.
 * doAllQuotes() calls this once per line.
 *
 * @param string $text The text to convert.
 * @return string The text with apostrophe runs replaced by <i>/<b> tags, or
 *                $text unchanged when it contains no run of two or more
 *                apostrophes.
 */
function doQuotes( $text ) {
	# Even indexes of $arr hold plain text, odd indexes hold the apostrophe
	# runs that separated them.
	$arr = preg_split( "/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
	$count = count( $arr );
	if ( $count == 1 ) {
		return $text;
	}

	# First, do some preliminary work. This may shift some apostrophes from
	# being mark-up to being text. It also counts the number of occurrences
	# of bold and italics mark-ups.
	$numbold = 0;
	$numitalics = 0;
	for ( $i = 1; $i < $count; $i += 2 ) {
		$len = strlen( $arr[$i] );
		if ( $len == 4 ) {
			# If there are ever four apostrophes, assume the first is supposed to
			# be text, and the remaining three constitute mark-up for bold text.
			$arr[$i-1] .= "'";
			$arr[$i] = "'''";
			$len = 3;
		} else if ( $len > 5 ) {
			# If there are more than 5 apostrophes in a row, assume they're all
			# text except for the last 5.
			$arr[$i-1] .= str_repeat( "'", $len - 5 );
			$arr[$i] = "'''''";
			$len = 5;
		}
		# A run of five counts as one bold AND one italics mark-up.
		if ( $len == 2 )      { $numitalics++;             }
		else if ( $len == 3 ) { $numbold++;                }
		else if ( $len == 5 ) { $numitalics++; $numbold++; }
	}

	# If there is an odd number of both bold and italics, it is likely
	# that one of the bold ones was meant to be an apostrophe followed
	# by italics. Which one we cannot know for certain, but it is more
	# likely to be one that has a single-letter word before it.
	if ( ( $numbold % 2 == 1 ) && ( $numitalics % 2 == 1 ) ) {
		$firstsingleletterword = -1;
		$firstmultiletterword = -1;
		$firstspace = -1;
		for ( $i = 1; $i < $count; $i += 2 ) {
			if ( strlen( $arr[$i] ) != 3 ) {
				continue;
			}
			# Last and second-to-last character of the text before this run.
			$x1 = substr( $arr[$i-1], -1 );
			$x2 = substr( $arr[$i-1], -2, 1 );
			if ( $x1 === ' ' ) {
				if ( $firstspace == -1 ) { $firstspace = $i; }
			} else if ( $x2 === ' ' ) {
				if ( $firstsingleletterword == -1 ) { $firstsingleletterword = $i; }
			} else {
				if ( $firstmultiletterword == -1 ) { $firstmultiletterword = $i; }
			}
		}

		# Preference order: after a single-letter word, then after a
		# multi-letter word, then after a space. All three can be -1, for
		# example when the only mark-up is one run of five apostrophes.
		$demote = -1;
		if ( $firstsingleletterword > -1 ) {
			$demote = $firstsingleletterword;
		} else if ( $firstmultiletterword > -1 ) {
			$demote = $firstmultiletterword;
		} else if ( $firstspace > -1 ) {
			$demote = $firstspace;
		}
		if ( $demote > -1 ) {
			# Turn that bold into "apostrophe + italics".
			$arr[$demote] = "''";
			$arr[$demote-1] .= "'";
		}
	}

	# Now let's actually convert our apostrophic mush to HTML!
	# $state names the tags currently open, in opening order ('bi' = <b><i>).
	# 'both' means a run of five was seen and we do not know yet which tag
	# has to be outermost, so the text is parked in $buffer until the next run.
	$output = '';
	$buffer = '';
	$state = '';
	foreach ( $arr as $i => $r ) {
		if ( ( $i % 2 ) == 0 ) {
			if ( $state == 'both' ) {
				$buffer .= $r;
			} else {
				$output .= $r;
			}
			continue;
		}

		$len = strlen( $r );
		if ( $len == 2 ) {
			if ( $state == 'i' )
			{ $output .= '</i>'; $state = ''; }
			else if ( $state == 'bi' )
			{ $output .= '</i>'; $state = 'b'; }
			else if ( $state == 'ib' )
			{ $output .= '</b></i><b>'; $state = 'b'; }
			else if ( $state == 'both' )
			{ $output .= '<b><i>'.$buffer.'</i>'; $state = 'b'; }
			else # $state can be 'b' or ''
			{ $output .= '<i>'; $state .= 'i'; }
		} else if ( $len == 3 ) {
			if ( $state == 'b' )
			{ $output .= '</b>'; $state = ''; }
			else if ( $state == 'bi' )
			{ $output .= '</i></b><i>'; $state = 'i'; }
			else if ( $state == 'ib' )
			{ $output .= '</b>'; $state = 'i'; }
			else if ( $state == 'both' )
			{ $output .= '<i><b>'.$buffer.'</b>'; $state = 'i'; }
			else # $state can be 'i' or ''
			{ $output .= '<b>'; $state .= 'b'; }
		} else if ( $len == 5 ) {
			if ( $state == 'b' )
			{ $output .= '</b><i>'; $state = 'i'; }
			else if ( $state == 'i' )
			{ $output .= '</i><b>'; $state = 'b'; }
			else if ( $state == 'bi' )
			{ $output .= '</i></b>'; $state = ''; }
			else if ( $state == 'ib' )
			{ $output .= '</b></i>'; $state = ''; }
			else if ( $state == 'both' )
			{ $output .= '<i><b>'.$buffer.'</b></i>'; $state = ''; }
			else # ($state == '')
			{ $buffer = ''; $state = 'both'; }
		}
	}

	# Now close all remaining tags.  Notice that the order is important.
	if ( $state == 'b' || $state == 'ib' ) {
		$output .= '</b>';
	}
	if ( $state == 'i' || $state == 'bi' || $state == 'ib' ) {
		$output .= '</i>';
	}
	if ( $state == 'bi' ) {
		$output .= '</b>';
	}
	# There might be a lonely ''''', so flush whatever was buffered after it.
	# Compare against '' explicitly: a buffer holding just "0" is falsy in PHP
	# and would otherwise be dropped from the output.
	if ( $state == 'both' && $buffer !== '' ) {
		$output .= '<b><i>'.$buffer.'</i></b>';
	}
	return $output;
}
	
# Convert headings first, then the bold/italic apostrophe markup.
$xml = doAllQuotes(doHeadings($xml));

# MediaWiki marks links as [[...]]; drop both bracket pairs and keep the text between them.
$xml = str_replace(array('[[', ']]'), '', $xml);

echo $xml;
exit;
?>

Refactorings

No refactoring yet !

3287d5bb845ffffe9a08b5f26c2fac71

catbert

August 30, 2008, August 30, 2008 23:09, permalink

No rating. Login to rate!

Well, you could specify "&format=php" in the API query string and then unserialize the output; you would simply get the content packed into an array.

9b5f1035b1af18076f994fc1a79cad81

Gnets

June 13, 2011, June 13, 2011 04:23, permalink

No rating. Login to rate!
9006e1b8fc7818a86be328de57fdd66e

Teleskope

May 26, 2011, May 26, 2011 13:12, permalink

No rating. Login to rate!
03c9bdd6f17073a693886362c42329f7

Michal

October 8, 2011, October 08, 2011 23:29, permalink

No rating. Login to rate!

You may try

<img>http://www.bookfan.eu/blog/images/logo/bookfan_logo_200x60px.png</img>

Your refactoring





Format Copy from initial code

or Cancel