<?php
class robots {
    /**
     * Split a path into its components and map every component name to the
     * cumulative path that leads up to (and includes) that component.
     *
     * Redundant separators are collapsed. On Windows both "\" and "/" are
     * accepted as separators and "\" is emitted; elsewhere only "/" is used.
     * A leading separator is not reproduced in the returned paths. If the
     * same component name occurs twice, the later occurrence overwrites the
     * earlier one because the component name is the array key.
     *
     * Originally borrowed from the user notes of the manual at www.php.net.
     *
     * @param string $path     Path to split.
     * @param bool   $endSlash Append a trailing separator to every entry that
     *                         was followed by a separator in $path.
     * @param bool   $base     Also include the last component when $path does
     *                         not end with a separator.
     * @return array ComponentName => CorrespondentPath
     */
    public function parsePathComponents($path, $endSlash = true, $base = false)
    {
        $path = trim((string) $path);
        $slash = strstr(PHP_OS, 'WIN') ? '\/' : '/';
        $separator = $slash[0];
        $retArray = array();
        $str = '';
        $temp = '';

        // Iterate by length: testing the character for truthiness (as the
        // original did) stops the scan at the first "0" in the path.
        $length = strlen($path);
        for ($x = 0; $x < $length; $x++) {
            $char = $path[$x];
            if (strpos($slash, $char) === false) {
                $temp .= $char;
            } elseif ($temp !== '') {
                $str .= $temp;
                $retArray[$temp] = $str . ($endSlash ? $separator : '');
                $str .= $separator;
                $temp = '';
            }
        }

        if ($base && $temp !== '') {
            $retArray[$temp] = $str . $temp;
        }

        return $retArray;
    }

    /**
     * Check that a URL has a resolvable host and answers with HTTP 200, 301
     * or 302. Performs a DNS lookup and an HTTP request via get_headers().
     *
     * @param string $url Absolute URL to probe.
     * @return bool True when the first status line reports 200, 301 or 302.
     */
    public function is_valid_url($url)
    {
        $parts = parse_url((string) $url);
        if (!is_array($parts) || !isset($parts['host'])) {
            return false;
        }

        $host = trim($parts['host']);
        if ($host === '') {
            return false;
        }

        $scheme = isset($parts['scheme']) ? trim($parts['scheme']) : 'http';
        // Only send an explicit port when the URL carried one; forcing ":80"
        // onto an https URL makes the request fail.
        $port = isset($parts['port']) ? ':' . (int) $parts['port'] : '';
        $path = isset($parts['path']) ? trim($parts['path']) : '';
        if ($path === '') {
            $path = '/';
        }
        if (isset($parts['query'])) {
            $path .= '?' . trim($parts['query']);
        }

        // gethostbyname() returns its argument unchanged when the lookup
        // fails, which is also what it does for an IP literal, so IP literals
        // are recognised separately.
        $isIpLiteral = filter_var(trim($host, '[]'), FILTER_VALIDATE_IP) !== false;
        if (!$isIpLiteral && gethostbyname($host) === $host) {
            return false;
        }

        $headers = get_headers($scheme . '://' . $host . $port . $path);
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        // Match the exact status codes; the former "[(200|301|302)]+" was a
        // character class and also accepted e.g. 100, 201 or 303.
        return preg_match('#^HTTP/\S+\s+(?:200|301|302)(?:\s|$)#i', $headers[0]) === 1;
    }

    /**
     * Evaluate robots.txt lines against a URL path without any network I/O.
     *
     * Rules are compared on whole path segments: "Disallow: /shop" covers
     * "/shop" and "/shop/item" but not "/shopping". "Disallow: /" covers
     * every path. When both an Allow and a Disallow rule match, the one
     * covering more segments wins and Allow wins a tie. Wildcards ("*", "$")
     * inside rules are not interpreted.
     *
     * @param array  $robot      Lines of the robots.txt file.
     * @param string $path       URL path to test, e.g. "/produkter/sko".
     * @param string $user_agent Agent whose group is used when it is listed
     *                           explicitly; otherwise the "*" group applies.
     * @return bool True when the path may be crawled.
     */
    public function isPathAllowed($robot, $path, $user_agent = '*')
    {
        if (!is_array($robot)) {
            $robot = array();
        }

        $agentKey = strtolower(trim((string) $user_agent));
        $permissionHash = array();
        $groupAgents = array();
        $lastWasAgent = false;

        foreach ($robot as $line) {
            $line = (string) $line;

            /* Throw away full-line and trailing comments. */
            $hashPos = strpos($line, '#');
            if ($hashPos !== false) {
                $line = substr($line, 0, $hashPos);
            }

            /* Split on the first colon only; skip lines that have none. */
            $pair = explode(':', $line, 2);
            if (count($pair) < 2) {
                continue;
            }
            $key = strtolower(trim($pair[0]));
            $value = trim($pair[1]);

            if ($key === 'user-agent') {
                /* Consecutive User-agent lines share one group of rules. */
                if (!$lastWasAgent) {
                    $groupAgents = array();
                }
                $agent = strtolower($value);
                $groupAgents[] = $agent;
                $lastWasAgent = true;

                if (($agent === '*' || $agent === $agentKey)
                    && !isset($permissionHash[$agent])) {
                    $permissionHash[$agent] = array(
                        'allow' => array(),
                        'disallow' => array(),
                    );
                }
                continue;
            }
            $lastWasAgent = false;

            if ($key !== 'allow' && $key !== 'disallow') {
                continue;
            }
            /* An empty value restricts nothing. */
            if ($value === '') {
                continue;
            }

            /* "/" normalises to '' which stands for the site root. */
            $rule = trim($value, '/');
            foreach ($groupAgents as $agent) {
                if ($agent === '*' || $agent === $agentKey) {
                    $permissionHash[$agent][$key][] = $rule;
                }
            }
        }

        /*
         * If our user agent is explicitly listed we use its rules,
         * otherwise we fall back to the rules given for all user agents.
         */
        if (isset($permissionHash[$agentKey])) {
            $rules = $permissionHash[$agentKey];
        } elseif (isset($permissionHash['*'])) {
            $rules = $permissionHash['*'];
        } else {
            return true;
        }

        /* Build '', 'a', 'a/b', ... for the path "/a/b". */
        $prefixes = array('');
        $current = '';
        foreach (explode('/', (string) $path) as $segment) {
            if ($segment === '') {
                continue;
            }
            $current = ($current === '') ? $segment : $current . '/' . $segment;
            $prefixes[] = $current;
        }

        $bestAllow = -1;
        $bestDisallow = -1;
        foreach ($prefixes as $depth => $prefix) {
            // Strict in_array: array_search() cast to bool reports a hit at
            // index 0 as "not found".
            if (in_array($prefix, $rules['disallow'], true)) {
                $bestDisallow = $depth;
            }
            if (in_array($prefix, $rules['allow'], true)) {
                $bestAllow = $depth;
            }
        }

        return $bestAllow >= $bestDisallow;
    }

    /**
     * Tell whether robots.txt on the URL's host permits crawling the URL.
     *
     * Fetches http://<host>/robots.txt over the network.
     * NOTE(review): the robots.txt request always uses plain http and the
     * default port, regardless of the scheme/port of $url - confirm that
     * this is intended.
     *
     * @param string $url        Absolute URL of the page to check.
     * @param string $user_agent Crawler name to evaluate the rules for.
     * @return bool True when crawling is permitted. False when a rule
     *              forbids it, when $url has no host, or when robots.txt
     *              could not be confirmed reachable by is_valid_url().
     */
    public function allowed($url, $user_agent = '*')
    {
        $urlparts = parse_url((string) $url);
        if (!is_array($urlparts) || empty($urlparts['host'])) {
            return false;
        }

        $robotsUrl = 'http://' . $urlparts['host'] . '/robots.txt';
        $path = isset($urlparts['path']) ? $urlparts['path'] : '';

        if (!$this->is_valid_url($robotsUrl)) {
            return false;
        }

        $robot = file($robotsUrl);
        if (!$robot) {
            $robot = array();
            // getFileContents() is a project helper that is not defined in
            // this file; use it as a fallback only when it is available.
            if (function_exists('getFileContents')) {
                $contents = getFileContents($robotsUrl);
                if (is_array($contents) && isset($contents['file'])) {
                    $robot = explode("\n", $contents['file']);
                }
            }
        }

        return $this->isPathAllowed($robot, $path, $user_agent);
    }
}
// Demo: ask whether the given page may be crawled and dump the result.
// Running this performs live DNS and HTTP requests through allowed().
$bot = new robots();
$result=$bot->allowed("http://www.lfweb.dk/produkter");
var_dump($result);
?>
Refactorings
No refactoring yet !
Adam
November 25, 2010, November 25, 2010 10:07, permalink
You might want to separate the concerns of your class. It is my opinion that the parser should parse and that the network operations should be handled elsewhere. I have provided an example parser class which, when supplied with the robots.txt contents, returns the paths that you are not allowed to crawl.
I cannot remember the last time I wrote any PHP code. This was a fun little exercise to try a different language. Thanks for sharing it.
<?php
/**
 * Extracts the Disallow paths that apply to this crawler from the text of a
 * robots.txt file. Performs no network access; the caller supplies the text.
 *
 * A User-agent group applies when its value is "*" or matches
 * ROBOT_USER_AGENT_PATTERN. Only User-agent and Disallow lines are read.
 */
class RobotsParser
{
    const ROBOT_USER_AGENT_PATTERN = "/Mozilla/i";
    /** Waiting for a User-agent line that applies to us. */
    const PROCESS_STATE_USER_AGENT = 0;
    /** An applicable User-agent line was seen; waiting for its first Disallow. */
    const PROCESS_STATE_USER_AGENT_MATCHED = 1;
    /** Collecting Disallow lines of an applicable group. */
    const PROCESS_STATE_DISALLOW = 2;

    // Declared explicitly (they used to be created dynamically, which is
    // deprecated since PHP 8.2); kept public so existing readers still work.
    public $state = self::PROCESS_STATE_USER_AGENT;
    public $disallowed_paths = array();

    /**
     * Parse the given robots.txt text immediately.
     *
     * Uses __construct(): a method named after the class is no longer
     * treated as a constructor on PHP 8, so nothing would be parsed.
     *
     * @param string $robots_txt Contents of a robots.txt file.
     * @throws Exception When the text holds no User-agent or Disallow line
     *                   (this includes the default empty string).
     */
    public function __construct($robots_txt = "")
    {
        $this->state = self::PROCESS_STATE_USER_AGENT;
        $this->disallowed_paths = array();
        $this->parse($robots_txt);
    }

    /**
     * @return array List of disallowed path strings, in file order.
     */
    public function disallowed_paths()
    {
        return $this->disallowed_paths;
    }

    /**
     * Pick the User-agent/Disallow lines out of the text and process them.
     *
     * @param string $robots_txt Contents of a robots.txt file.
     * @throws Exception When no such line is present.
     */
    protected function parse($robots_txt)
    {
        // Anchored at line starts so directives inside "#" comments are not
        // picked up; the capture stops at "#", CR or LF so trailing comments
        // and line endings never become part of a value.
        $pattern = '/^[ \t]*((?:User-agent|Disallow)[ \t]*:[^\r\n#]*)/im';
        if (!preg_match_all($pattern, (string) $robots_txt, $matches)) {
            throw new Exception("robots.txt was invalid.");
        }
        $this->process($matches[1]);
    }

    /**
     * Split each "Key: value" line and feed it to the state machine.
     *
     * @param array $lines Directive lines.
     */
    protected function process($lines)
    {
        foreach ($lines as $line) {
            $line = (string) $line;
            $hashPos = strpos($line, '#');
            if ($hashPos !== false) {
                $line = substr($line, 0, $hashPos);
            }
            // Split on the FIRST colon so values such as "/a:b" stay intact.
            $pair = explode(':', $line, 2);
            if (count($pair) < 2) {
                continue;
            }
            $this->process_line(trim($pair[0]), trim($pair[1]));
        }
    }

    /**
     * Advance the state machine by one directive.
     *
     * @param string $key   Directive name ("User-agent" or "Disallow").
     * @param string $value Directive value.
     */
    protected function process_line($key, $value)
    {
        $key = strtolower($key);

        switch ($this->state) {
            case self::PROCESS_STATE_USER_AGENT:
                // Parenthesised: without it "&&" bound tighter than "||" and
                // ANY line whose value matched the pattern started a group.
                if ($key == "user-agent"
                    && ($value == "*"
                        || preg_match(self::ROBOT_USER_AGENT_PATTERN, $value))) {
                    $this->state = self::PROCESS_STATE_USER_AGENT_MATCHED;
                }
                break;

            case self::PROCESS_STATE_USER_AGENT_MATCHED:
                if ($key == "disallow") {
                    $this->state = self::PROCESS_STATE_DISALLOW;
                    // Re-dispatch so this first Disallow line is recorded.
                    $this->process_line($key, $value);
                }
                break;

            case self::PROCESS_STATE_DISALLOW:
                if ($key == "disallow") {
                    // An empty Disallow restricts nothing; it is treated as
                    // clearing every rule collected so far. Compared with ''
                    // because empty() would also swallow the value "0".
                    if ($value === '') {
                        $this->disallowed_paths = array();
                    } else {
                        array_push($this->disallowed_paths, $value);
                    }
                } else {
                    // A new group starts: check whether it applies to us.
                    $this->state = self::PROCESS_STATE_USER_AGENT;
                    $this->process_line($key, $value);
                }
                break;
        }
    }
}
// Usage example: build a parser from the robots.txt text, then read the disallowed paths.
// NOTE(review): $contents_of_robots_txt is not defined in this snippet - the caller must supply it.
$parser = new RobotsParser($contents_of_robots_txt);
// The returned list is discarded here; assign it to a variable to use it.
$parser->disallowed_paths();
?>
https://www.google.com/accounts/o8/id?id=AItOawklZdhdjqwQTw9BJpq1cfSII6qymuhSqR4
January 1, 2011, January 01, 2011 15:54, permalink
Thanks for your input. I will try to separate the contexts.
Sorry that I haven't looked here for a month! :-(
Happy new year
Hi there,
I want to make a PHP class which I can use to test whether I am allowed to read a webpage from a URL or not.
I have borrowed a little code and made some of my own.
Will you comment on the code please? :-)