<?php
//Downloads the contents of a URL by trying several transport mechanisms in turn
//(file_get_contents, curl, a raw socket, then the wget binary), because hosting
//companies differ in which of these they leave enabled.
//Every download method returns the downloaded string on success and FALSE on failure.
class RequestFile
{
    public $StatusLogOn = FALSE; //turn logging on/off (TRUE/FALSE)

    //tries each download method in order and returns the first non-empty result
    //returns FALSE when every method fails
    public function get_file($url)
    {
        $methods = array(
            'cc_file_get_contents', //the standard approach
            'cc_curl_file',         //if that didn't work try curl
            'cc_socket_file',       //if that didn't work try a socket
            'cc_wget_file',         //if all else fails try wget (assuming we're on linux)
        );
        foreach ($methods as $method)
        {
            $data = $this->$method($url);
            if ($this->has_content($data))
            {
                return $data;
            }
        }
        return FALSE;
    }

    //file_get_contents with logging of whether it worked or not
    //note this (or fopen) will not work for remote urls if allow_url_fopen = 0 in php.ini
    public function cc_file_get_contents($url)
    {
        $data = file_get_contents($url);
        if ($this->has_content($data))
        {
            $this->log_event(' SUCCESS: Data Has Been Downloaded Using file_get_contents');
            return $data;
        }
        //always have a failure reason to log, whatever the cause was
        if (!$this->ini_flag_enabled('allow_url_fopen'))
        {
            $msg = "fopen wrappers are disabled";
        }
        else
        {
            $msg = 'file_get_contents could not retrieve '.$url;
        }
        $this->log_event(' FAIL: '.$msg);
        return FALSE;
    }

    //try and use curl
    public function cc_curl_file($url)
    {
        //make sure curl is installed
        if (!function_exists('curl_init'))
        {
            $this->log_event(' FAIL: CURL is not installed');
            return FALSE;
        }
        $ch = curl_init(); //initialize a new curl handle
        if ($ch === FALSE)
        {
            $this->log_event(' FAIL: CURL could not be initialised');
            return FALSE;
        }
        curl_setopt($ch, CURLOPT_URL, $url); //set the url to fetch
        curl_setopt($ch, CURLOPT_HEADER, 0); //don't give me the headers just the content
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); //return the value instead of printing the response to browser
        curl_setopt($ch, CURLOPT_USERAGENT, $this->rand_user_agent()); //use a user agent to mimic a browser
        $data = curl_exec($ch); //run the command
        //the error text has to be read before the handle is closed
        $error = ($data === FALSE) ? curl_error($ch) : '';
        curl_close($ch); //remember to always close the session and free all resources
        if ($data === FALSE)
        {
            //curl_exec returns FALSE on failure; do not report that as a success
            $this->log_event(' FAIL: CURL request failed: '.$error);
            return FALSE;
        }
        $this->log_event(' SUCCESS: Data Has Been Downloaded Using CURL');
        return $data;
    }

    //try and use sockets: sends a plain HTTP/1.0 GET and returns the response body
    public function cc_socket_file($url)
    {
        $parsedUrl = parse_url($url); //get the host name and url path
        if (!is_array($parsedUrl) || empty($parsedUrl['host']))
        {
            //nothing to connect to without a host
            $this->log_event(' FAIL: Socket Failed Cannot Retrieve '.$url);
            return FALSE;
        }
        $host = $parsedUrl['host'];
        //no path means the url is pointing to the host like http://www.mysite.com
        $path = isset($parsedUrl['path']) ? $parsedUrl['path'] : '/';
        if (isset($parsedUrl['query']))
        {
            $path .= '?'.$parsedUrl['query'];
        }
        //https needs the ssl:// transport and defaults to port 443
        $isHttps = isset($parsedUrl['scheme']) && strtolower($parsedUrl['scheme']) === 'https';
        $defaultPort = $isHttps ? 443 : 80;
        //honour an explicit port in the url instead of always connecting to 80
        $port = isset($parsedUrl['port']) ? (int) $parsedUrl['port'] : $defaultPort;
        $target = ($isHttps ? 'ssl://' : '').$host;
        $hostHeader = ($port === $defaultPort) ? $host : $host.':'.$port;
        $timeout = 10;
        $fp = @fsockopen($target, $port, $errno, $errstr, $timeout); //connect to the remote server
        if (!$fp)
        {
            $this->log_event(' FAIL: Socket Failed Cannot Retrieve '.$url);
            return FALSE;
        }
        stream_set_timeout($fp, $timeout); //don't wait forever on a stalled server
        //send the necessary headers to get the file
        //"Connection: close" makes the server end the stream after the response,
        //so the read loop below terminates instead of waiting for the timeout
        fputs($fp, "GET $path HTTP/1.0\r\n" .
            "Host: $hostHeader\r\n" .
            "User-Agent: ".$this->rand_user_agent()."\r\n" .
            "Accept: */*\r\n" .
            "Accept-Language: en-us,en;q=0.5\r\n" .
            "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n" .
            "Connection: close\r\n" .
            "Referer: http://$host\r\n\r\n");
        //retrieve the response from the remote server
        $response = '';
        while (!feof($fp))
        {
            $chunk = fread($fp, 4096);
            //compare strictly so a chunk consisting of "0" does not end the loop early
            if ($chunk === FALSE || $chunk === '')
            {
                break;
            }
            $response .= $chunk;
        }
        fclose($fp);
        //strip the headers
        $pos = strpos($response, "\r\n\r\n");
        if ($pos === FALSE)
        {
            //no header/body separator means no usable HTTP response was received
            $this->log_event(' FAIL: Socket Failed Cannot Retrieve '.$url);
            return FALSE;
        }
        $response = (string) substr($response, $pos + 4);
        $this->log_event(' SUCCESS: Data Has Been Downloaded Via A Socket Connection');
        return $response;
    }

    //try and use wget if on linux
    //the download goes to a temporary file which is read back and then removed
    public function cc_wget_file($url)
    {
        $failure = ' FAILED: Data Could not be retrieved using wget and file_get_contents';
        if (!function_exists('exec'))
        {
            $this->log_event($failure);
            return FALSE;
        }
        $tmpFile = tempnam(sys_get_temp_dir(), 'cc_wget_');
        if ($tmpFile === FALSE)
        {
            $this->log_event($failure);
            return FALSE;
        }
        //escape both arguments so the url cannot inject shell commands;
        //"--" stops wget from treating a url starting with "-" as an option
        $cmd = 'wget -q -O '.escapeshellarg($tmpFile).' -- '.escapeshellarg($url);
        $output = array();
        $exitCode = 1;
        exec($cmd, $output, $exitCode);
        $data = ($exitCode === 0) ? file_get_contents($tmpFile) : FALSE;
        if (is_file($tmpFile))
        {
            unlink($tmpFile);
        }
        if ($this->has_content($data))
        {
            $this->log_event(' SUCCESS: File Downloaded Using wget');
            return $data;
        }
        $this->log_event($failure);
        return FALSE;
    }

    //select a random user agent from the db, falling back to a default one
    public function rand_user_agent()
    {
        $sql = "SELECT * FROM cc_user_agents ORDER BY rand()";
        //code to get a record from the database goes here
        //until that code exists there is no row, so the default below is used
        $row = FALSE;
        if ($row === FALSE || !$row[0])
        {
            $this->log_event(' FAIL: Could Not Retrieve Random User Agent - Using Default');
            return "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3";
        }
        return $row[0];
    }

    //log status to a file (only when StatusLogOn is TRUE); always returns TRUE
    public function log_status($msg)
    {
        if ($this->StatusLogOn === TRUE)
        {
            $path = "RequestFile.status.log";
            //file_put_contents takes the filename first, then the data;
            //FILE_APPEND adds the line without re-reading the whole log
            file_put_contents($path, "\n".$msg, FILE_APPEND);
        }
        return TRUE;
    }

    //prefixes a status message with the current time and passes it to log_status
    private function log_event($text)
    {
        $this->log_status(date("H:i:s d-m-Y").$text);
    }

    //TRUE when $data is a non-empty string; unlike a plain truthiness
    //check this accepts a body consisting of the single character "0"
    private function has_content($data)
    {
        return is_string($data) && $data !== '';
    }

    //TRUE when a boolean php.ini setting is switched on
    //ini_get may report it as "1" or as the literal text from the ini file (e.g. "On")
    private function ini_flag_enabled($name)
    {
        $value = strtolower(trim((string) ini_get($name)));
        return in_array($value, array('1', 'on', 'yes', 'true'), TRUE);
    }
}
?>
Refactorings
No refactoring yet !
Ishkur
August 19, 2008, August 19, 2008 18:41, permalink
In your function 'public function cc_wget_file($url)', you're passing the string through exec(). That's fine, but PHP also allows you to pass cli strings via backticks (http://www.php.net/language.operators.execution). Might help, might not, ymmv.
Chris Dean
August 20, 2008, August 20, 2008 16:20, permalink
@Ishkur
I try to avoid back-ticks as (for me at least) they obfuscate the code a little bit.
That said, it may give a small performance boost - I don't know as I've not done any such tests, but maybe I will if I get 5 mins :)
bartz
January 13, 2009, January 13, 2009 16:49, permalink
I'd refactor all the "action" methods out into separate classes.
All those methods share similarity in variables they use and logging.
// untested, and my PHP is verrry rusty...
//Base class for one download strategy.
//A subclass sets $url and $method and implements check_prerequisites() and do_fetch();
//fetch() runs the strategy and logs the outcome.
abstract class CCAction
{
    const FAIL = "FAIL";
    const SUCCESS = "SUCCESS";
    protected $url;    //the url to download
    protected $method; //name of the download method, used in log messages

    //runs the download and logs the result
    //returns whatever do_fetch() produced on success, FALSE otherwise
    public function fetch()
    {
        if (!isset($this->url))
        {
            $this->log(self::FAIL, "No url set");
            return FALSE; //nothing to download, so stop here
        }
        if (!$this->check_prerequisites())
        {
            $this->log(self::FAIL, "Prerequisites not met for ".$this->method);
            return FALSE;
        }
        $data = $this->do_fetch();
        if ($data)
        {
            $this->log(self::SUCCESS, "File has been downloaded using ".$this->method);
            return $data;
        }
        $this->log(self::FAIL, "Could not download file using ".$this->method);
        return FALSE;
    }

    //returns TRUE when this download method can be used in the current environment
    abstract protected function check_prerequisites();

    //performs the download; returns the data, or a falsy value on failure
    abstract protected function do_fetch();

    //formats a timestamped status line and hands it to log_status
    protected function log($status, $message)
    {
        $this->log_status(date("H:i:s d-m-Y")." $status : $message");
    }

    //writes a status line; the default sends it to PHP's error log
    //subclasses may override this to log somewhere else
    protected function log_status($msg)
    {
        error_log($msg);
    }
}
//Download strategy that retrieves the url with the curl extension.
class CCGetUsingCurl extends CCAction
{
    protected $method = "curl"; //name used in the log messages

    //the url may be given at construction time or later through __init()
    public function __construct($url = NULL)
    {
        if ($url !== NULL)
        {
            $this->url = $url;
        }
    }

    //kept for existing callers: sets the url to download
    //PHP does not call this automatically; __construct is the real constructor
    public function __init($url)
    {
        $this->url = $url;
    }

    //curl can only be used when the extension is loaded
    protected function check_prerequisites()
    {
        return function_exists('curl_init');
    }

    //fetches the url with curl; returns the response body or FALSE on failure
    protected function do_fetch()
    {
        $ch = curl_init();
        if ($ch === FALSE)
        {
            return FALSE;
        }
        curl_setopt($ch, CURLOPT_URL, $this->url);
        curl_setopt($ch, CURLOPT_HEADER, 0); //content only, no headers
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); //return the body instead of printing it
        $data = curl_exec($ch); //FALSE on failure
        curl_close($ch);
        return $data;
    }
}
After finding different hosting companies having wildly different policies when it comes to what's enabled and what isn't in PHP, I'm trying to build a generic file download class which will work in almost all situations.
What I'd like to know is if any of the code below can be made more efficient or robust and also if anyone has a function for achieving the equivalent wget call on Windows.
I should just mention that this class has been cleaned of some application specific code, so if there's any disjointed bits I apologise - I have tried to deal with any discontinuities before posting but there may be something I've missed (this is also the reason for the bunch of if's in get_file() - the main version follows a few other processes/checks before attempting one of the download functions).
Thanks
Chris