<?
function robots_allowed($url){
$current_url=$url;
$xmp=explode("/", $current_url."/");
$robotsdomain=trim("http://".$xmp[2]);
$stipped_robotsdomain=str_replace("/","",$robotsdomain);
$stripped_current_url=str_replace("/", "" ,$url);
$my_user_agent="User-agent: intermap"; //my useragent
$robots=Read_Content($robotsdomain.'/robots.txt');
$robots=explode("\n",$robots);
for ($i=0;$i<sizeof($robots);$i++){
if (trim($robots[$i])==$my_user_agent){ // rules for agent: *
for ($checkrules=1;$checkrules<10;$checkrules++){
if (trim($robots[$i+$checkrules])!=""){
$pos = strpos( $current_line[$count],"User-agent");
if (is_integer($pos)) break;
$pos = strpos( $current_line[$count],"#");
if (is_integer($pos)) $current_line[$count]=substr($current_line[$count],0,$pos);
$disallow_line=str_replace("Disallow: ", "" ,$robots[$i+$checkrules]);
//$disallow_line=str_replace("http://", "" ,$disallow_line);
$disallow_line=str_replace("/", "" ,$disallow_line);
$newdata[$num]=$stipped_robotsdomain.$disallow_line;
$num++;
$count++;
}
}
}
}
$forbidden=1;
for ($last=0;$last<20;$last++){
if (trim($newdata[$last])!=""){
if (preg_match("/".trim($newdata[$last])."/i",$stripped_current_url)) {$forbidden=0;}
}
}
return $forbidden;
}
function Read_Content($url){// Open een url return content
$handle=@fopen($url,"r");
if($handle){
$contents = fread ($handle, 10000);
fclose($handle);
}
return $contents;
}
?>
Refactorings
No refactoring yet !
typefreak
October 28, 2007, October 28, 2007 16:12, permalink
I'm not currently refactoring, but I have a few comments:
1: You don't check for user agent * (Only for $my_user_agent)
2: You don't check for allow lines (sometimes a exception for disallowed pages is given in 'allow: ' lines)
3: At the end of the main function, you're using $forbidden a bit strange: (You want a boolean answer, so use true/false. And in this case, as the function is robots_allowed(), I would rather call the variable $allowed instead of $forbidden.)
4: Why is this line?
$disallow_line=str_replace("/", "" ,$disallow_line);
What if a site has
Disallow: /info/secret in its list?
Currently, you'r checking if the requested url contains infosecret, instead of info/secret
5: (related to 4), When checking url's, It isn't wise to use '/' as the delimiter, as the url itself can contain these caracters. Better use # instead.
6: In Read_Content(), if fopen fails, you'll probably get a notice at the return, because $contents isn't set. (Please, don't suppress, but solve)
Marco Valtas
October 30, 2007, October 30, 2007 05:20, permalink
Hi, when I saw your code I thought that a class Robot could be useful, the nicest thing was that I found a Perl module (WWW::RobotRules) that do exactly what your code propose but in a OO way.
What I did was translate the Perl module to PHP. You can tweak around to see if help on your problem. I'm not a professional PHP programmer so maybe some specific optimization can be done.
Some caveats: the regular expression engine PCRE does not allow repeat quantifiers on lookahead assertions. (see: http://www.php.net/manual/en/reference.pcre.pattern.syntax.php) but in the original Perl module were some, actually one, I don't think will differ but keep it in mind (in the function useragent()). I've could not test this code enough so care should be taken.
As you probably will notice I didn't translated all functionality of the original module, there's no time keeping and one object WWW_Robot should e used for only one domain.
Hope this helps.
<?
class WWW_Robot {
var $url;
var $useragent = "*";
// array which we mark the disallowed paths
var $rules = null;
/* Should find and parse the robots file,
* cache the result for ->allowed() subsequent calls.
* If the file could not be found ->allowed() should
* return TRUE for any call.
*/
function parseURL($url_given) {
// boolean flags...
$is_me = false;
$is_anon = false;
$me_disallowed = null;
$anon_disallowed = null;
$this->url = parse_url($url_given);
$robot_file_data = $this->retrieve_robot_file($this->url);
if(! isset($robot_file_data) ) { // robots.txt not exists
}
else { // robots.txt file exists
foreach(explode("\n",$robot_file_data) as $line) {
$line = preg_replace("/\015$/", "", $line); // removing CRs if exists.
if(preg_match("/\s*\#/", $line)) continue; // skipping comments.
$line = preg_replace("/\s*\#.*/", "", $line); // removing comments at end of a line.
if(preg_match("/^\s*$/", $line)) {
if($is_me) break;
$is_anon = false;
}
elseif(preg_match("/^User-Agent:\s*(.*)/i", $line, $found)) {
$ua = preg_replace("/\s+$/", "", $found[1]); // removing tralling space.
if($is_me) {
}
elseif( $ua == '*' ) {
$is_anon = true;
}
elseif($this->match_with_me($ua)) {
$is_me = true;
}
}
elseif(preg_match("/^Disallow:\s*(.*)/i", $line, $found)) {
if(!isset($ua)) $is_anon = true; // disalow w/o previous UA, assuming *
$disallow = strtolower(preg_replace("/\s+$/", "", $found[1]));
if($is_me) {
$me_disallowed[] = $disallow;
}
elseif($is_anon) {
$anon_disallowed[] = $disallow;
}
}
else {
/* Google, and probably others, uses a Allow in robots.txt, this is probably a extenssion
* of the robots.txt syntax, we do not support these.
* If want to to see warnings about these lines uncomment the
* code below.
*/
//trigger_error("Strange line in robots file: $line", E_USER_WARNING);
}
}// end foreach()
if($is_me) {
$this->rules = $me_disallowed;
}
else {
$this->rules = $anon_disallowed;
}
}// end else robots.txt file exsits.
}// end parseURL()
function match_with_me($ua) {
if(strtolower($this->useragent) == strtolower($ua)) {
return true;
}
else {
return false;
}
}
function retrieve_robot_file($from_url) {
$robot_file = @file_get_contents($from_url['scheme'].'://'.$from_url['host'].'/robots.txt');
return $robot_file;
}
/*
* This method returns true if our agent has permission
* to enter (crawl) the PATH argument.
*/
function allowed($path) {
if(!isset($this->rules)) return true;
foreach($this->rules as $rule) {
$strcmp_result = strcmp($rule, strtolower($path));
$pos;
if($strcmp_result == 0) {
return false; // we have a match
}
elseif($strcmp_result < 0) {
$pos = strpos($path, $rule, 0);
}
else {
$pos = strpos($rule, $path, 0);
}
if($pos === 0) return false;
}
return true; // if we could not find a rule to disallow
}
// get/set for useragent...
function useragent($ua = null) {
if(isset($ua)) {
$this->me_disallowed = null; // cleaning data
$this->anon_disallowed = null; // cleaning data
$this->useragent = preg_replace("!/\s*\d+.\d+\s*$!", "", $ua); // original re: s!/?\s*\d+.\d+\s*$!!
}
return $this->useragent; // to inform our current useragent.
}
} //end class
?>
<?
// test code...
$robot = new WWW_Robot;
$robot->useragent("Some UserAgent");
$robot->parseURL("http://www.google.com.br");
// can we crawl this dir in google?
echo "->".$robot->allowed("/defauts/")."<-\n";
// can we crawl this dir in google?
echo "->".$robot->allowed("/trends/")."<-\n";
?>
Deepak Pradhan
December 8, 2009, December 08, 2009 17:43, permalink
One more way is here
Usage:
$Robots = new Robots;
$Robots->domain ='www.microsoft.com';
$Robots->read_robots_file();
$Robots->getRules();
print_r($Robots->rules);
echo $Robots->chkAccess('http://microsoft.com');
echo $Robots->chkAccess('http://microsoft.com/uk/mnp_utility.mspx?eee=4444');
<?
/*
Ref: http://www.robotstxt.org/orig.html
-------------------------------------------------------------------------------
*/
class Robots {
var $domain ='';
var $lines = array();
var $rules = array();
var $chkUrl = '';
function read_robots_file() {
//robots file must be in home dir
$RobotFile = 'http://'.$this->domain.'/robots.txt';
$lines = @file($RobotFile);
if(!$lines) { // robots file NOT FOUND
return false;
}
// Strip blank lines and remove comments
foreach ($lines as $line) {
// Chunk off imbeded comment, ignore temp - text after comment mark
if (strpos($line,'#')!==false) {
list($line,$temp) = explode('#', $line);
}
// Each record contains lines of the form
// "<field>:<optionalspace><value><optionalspace>".
$line = preg_replace('/\s*/m', '', $line); //replace all spaces , tabs, NL
list($field, $value) = explode(':', $line, 2);
if (!$field) { // skip empty lines
continue;
} elseif (substr($field,0,1)=='#') { // skip comment lines
continue;
} else { // remember current agent
$this->lines[] = $line;
}
}
}
function getRules() {
// Loop over each line of the file
foreach ($this->lines as $line) {
list($field, $value) = explode(':', $line, 2);
if (strtolower($field)=='user-agent') { // remember current agent
$CurrAgent = ($value=='*')?'ALL':$value;
}
else { //parse the value
$item = new stdClass;
$item->value = preg_quote(trim($value), '/');
$url = 'http://'.$this->domain.$value;
$URIs = parse_url($url);
//Determine path, file & ext
if ($URIs['path']) {
$folders = explode('/', $URIs['path']);
if ($folders[count($folders)-1]) {
$item->file = $folders[count($folders)-1];
$item->path = str_replace($item->file, '', $URIs['path']);
}
}
if (substr($value,-1) == '?') {
$item->query='*';
} elseif ($URIs['query']) {
$item->query=$URIs['query'];
}
if (strtolower($field) == "sitemap") {
$item->sitemap = 1;
$Robots['Sitemap']['sitemap'][]=$item;
} else {
$Robots[$CurrAgent][$field][]=$item;
}
}
}
$this->rules = $Robots;
}
function chkAccess($url) {
$parsed = parse_url($url);
foreach ($this->rules['ALL']['Disallow'] as $item) {
$rule = $item->value;
if(preg_match("/^$rule/", $parsed['path'])) return false;
}
return true;
}
}
?>
Deepak Pradhan
December 8, 2009, December 08, 2009 18:06, permalink
modified
<?
/*
Ref: http://www.robotstxt.org/orig.html
-------------------------------------------------------------------------------
*/
class Robots {
var $domain ='';
var $lines = array();
var $rules = array();
var $chkUrl = '';
function read_robots_file() {
//robots file must be in home dir
$RobotFile = 'http://'.$this->domain.'/robots.txt';
$lines = @file($RobotFile);
if(!$lines) { // robots file NOT FOUND
return false;
}
// Strip blank lines and remove comments
foreach ($lines as $line) {
// Chunk off imbeded comment, ignore temp - text after comment mark
if (strpos($line,'#')!==false) {
list($line,$temp) = explode('#', $line);
}
// Each record contains lines of the form
// "<field>:<optionalspace><value><optionalspace>".
$line = preg_replace('/\s*/m', '', $line); //replace all spaces , tabs, NL
list($field, $value) = explode(':', $line, 2);
if (!$field) { // skip empty lines
continue;
} elseif (substr($field,0,1)=='#') { // skip comment lines
continue;
} else { // remember current agent
$this->lines[] = $line;
}
}
}
function getRules() {
// Loop over each line of the file
foreach ($this->lines as $line) {
list($field, $value) = explode(':', $line, 2);
if (strtolower($field)=='user-agent') { // remember current agent
$CurrAgent = ($value=='*')?'ALL':$value;
}
else { //parse the value
if (strtolower($field) == "sitemap") {
$Robots['Sitemap']['sitemap'][]=$value;
} else {
$Robots[$CurrAgent][$field][]=preg_quote(trim($value), '/');
}
}
}
$this->rules = $Robots;
}
function chkAccess($url,$Agent='ALL') {
$parsed = parse_url($url);
if (is_array($this->rules[$Agent]['Disallow'])) {
foreach ($this->rules[$Agent]['Disallow'] as $i=> $rule) {
if(preg_match("/^$rule/", $parsed['path'])) return 'false';
}
}
if (is_array($this->rules[$Agent]['Allow'])) {
foreach ($this->rules[$Agent]['Allow'] as $i=> $rule) {
if(preg_match("/^$rule/", $parsed['path'])) return 'true';
}
}
return 'true';
}
}
?>
This code will refer to the robots.txt file for a website and return a boolean value on whether or not to spider that particular page.