1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
<?php
/**
 * Generic "download a URL" helper for hosts with unpredictable PHP setups.
 *
 * get_file() tries, in order: file_get_contents(), cURL, a raw socket and
 * finally the wget binary, returning the first non-empty result. Each
 * attempt is recorded through log_status() when $StatusLogOn is TRUE.
 *
 * Every fetch method returns the downloaded body as a string, or FALSE on
 * failure.
 */
class RequestFile
{
    public $StatusLogOn = FALSE; //turn logging on/off (TRUE/FALSE)

    /**
     * Retrieve a URL, falling back through every available transport.
     *
     * A result that is FALSE or otherwise "empty" (e.g. '') is treated as a
     * failure and triggers the next transport.
     *
     * @param string $url the address to download
     * @return string|false the downloaded data, or the (falsy) result of the
     *                      last transport when all of them failed
     */
    public function get_file($url)
    {
        $methods = array(
            'cc_file_get_contents', //standard PHP stream wrapper
            'cc_curl_file',         //cURL extension
            'cc_socket_file',       //hand-rolled HTTP over a socket
            'cc_wget_file',         //external wget binary (linux)
        );

        $data = FALSE;
        foreach ($methods as $method)
        {
            $data = $this->$method($url);
            if ($data)
            {
                break;
            }
        }

        return $data;
    }

    /**
     * file_get_contents() with logging of success/failure.
     *
     * Note that this (like fopen) cannot fetch remote URLs when
     * allow_url_fopen = 0 in php.ini.
     *
     * @param string $url the address (or local path) to read
     * @return string|false the contents, or FALSE when nothing was read
     */
    public function cc_file_get_contents($url)
    {
        //failure is expected on many hosts and handled below, so the
        //warning is suppressed rather than leaked into the page output
        $data = @file_get_contents($url);

        if ($data)
        {
            $this->log_status(date("H:i:s d-m-Y").' SUCCESS: Data Has Been Downloaded Using file_get_contents');
            return $data;
        }

        //always define a message: previously $msg was undefined whenever
        //the wrappers were enabled but the request itself failed
        if (!filter_var(ini_get('allow_url_fopen'), FILTER_VALIDATE_BOOLEAN))
        {
            $msg = "fopen wrappers are disabled";
        }
        else
        {
            $msg = 'file_get_contents could not retrieve '.$url;
        }

        $this->log_status(date("H:i:s d-m-Y").' FAIL: '.$msg);
        return FALSE;
    }

    /**
     * Fetch the URL with the cURL extension.
     *
     * @param string $url the address to download
     * @return string|false the response body, or FALSE when cURL is not
     *                      installed or the transfer failed
     */
    public function cc_curl_file($url)
    {
        //make sure curl is installed
        if (!function_exists('curl_init'))
        {
            $this->log_status(date("H:i:s d-m-Y").' FAIL: CURL is not installed');
            return FALSE;
        }

        $ch = curl_init();                              //initialize a new curl resource
        curl_setopt($ch, CURLOPT_URL, $url);            //set the url to fetch
        curl_setopt($ch, CURLOPT_HEADER, 0);            //don't give me the headers just the content
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);    //return the value instead of printing the response to browser
        curl_setopt($ch, CURLOPT_USERAGENT, $this->rand_user_agent()); //use a user agent to mimic a browser

        $data  = curl_exec($ch);                        //run the command
        $error = curl_error($ch);                       //must be read before the handle is closed
        curl_close($ch);                                //always close the session and free all resources

        //curl_exec() returns FALSE on a failed transfer; do not report that as a success
        if ($data === FALSE)
        {
            $this->log_status(date("H:i:s d-m-Y").' FAIL: CURL Could Not Retrieve '.$url.' ('.$error.')');
            return FALSE;
        }

        $this->log_status(date("H:i:s d-m-Y").' SUCCESS: Data Has Been Downloaded Using CURL');
        return $data;
    }

    /**
     * Fetch the URL by speaking HTTP/1.0 over a socket.
     *
     * Honours the port given in the URL (default 80, or 443 over ssl:// for
     * https URLs) and asks the server to close the connection so that the
     * read loop terminates as soon as the body has been sent.
     *
     * @param string $url the address to download
     * @return string|false the response body with the HTTP headers
     *                      stripped, or FALSE on any failure
     */
    public function cc_socket_file($url)
    {
        $parsedUrl = parse_url($url); //get the host name and url path

        //without a host there is nothing to connect to
        if (!is_array($parsedUrl) || empty($parsedUrl['host']))
        {
            $this->log_status(date("H:i:s d-m-Y").' FAIL: Socket Failed Cannot Retrieve '.$url);
            return FALSE;
        }

        $host   = $parsedUrl['host'];
        $secure = isset($parsedUrl['scheme']) && strtolower($parsedUrl['scheme']) === 'https';

        //no path means the url is pointing to the host like http://www.mysite.com
        $path = isset($parsedUrl['path']) ? $parsedUrl['path'] : '/';
        if (isset($parsedUrl['query']))
        {
            $path .= '?'.$parsedUrl['query'];
        }

        $defaultPort = $secure ? 443 : 80;
        $port        = isset($parsedUrl['port']) ? (int) $parsedUrl['port'] : $defaultPort;
        $timeout     = 10;

        //connect to the remote server on the port the url asked for
        $fp = @fsockopen(($secure ? 'ssl://' : '').$host, $port, $errno, $errstr, $timeout);
        if (!$fp)
        {
            $this->log_status(date("H:i:s d-m-Y").' FAIL: Socket Failed Cannot Retrieve '.$url);
            return FALSE;
        }

        //bound the reads as well as the connect
        stream_set_timeout($fp, $timeout);

        //the Host header must carry the port when it is not the default one
        $hostHeader = ($port === $defaultPort) ? $host : $host.':'.$port;

        //send the necessary headers to get the file; "Connection: close"
        //because the whole response is read until the server hangs up
        fwrite($fp, "GET $path HTTP/1.0\r\n" .
                    "Host: $hostHeader\r\n" .
                    "User-Agent: ".$this->rand_user_agent()."\r\n" .
                    "Accept: */*\r\n" .
                    "Accept-Language: en-us,en;q=0.5\r\n" .
                    "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n" .
                    "Connection: close\r\n" .
                    "Referer: http://$host\r\n\r\n");

        //retrieve the response from the remote server
        $response = '';
        while (!feof($fp))
        {
            $chunk = fread($fp, 4096);
            if ($chunk === FALSE || $chunk === '')
            {
                break; //read error or timeout
            }
            $response .= $chunk;
        }
        fclose($fp);

        //strip the headers; a response without the blank separator line is not valid HTTP
        $pos = strpos($response, "\r\n\r\n");
        if ($pos === FALSE)
        {
            $this->log_status(date("H:i:s d-m-Y").' FAIL: Socket Failed Cannot Retrieve '.$url);
            return FALSE;
        }
        $response = (string) substr($response, $pos + 4);

        $this->log_status(date("H:i:s d-m-Y").' SUCCESS: Data Has Been Downloaded Via A Socket Connection');
        return $response;
    }

    /**
     * Fetch the URL by shelling out to wget (linux hosts).
     *
     * The download is written to a temporary file which is read back and
     * then removed, so the result does not depend on the file name wget
     * would derive from the URL.
     *
     * @param string $url the address to download
     * @return string|false the downloaded data, or FALSE on failure
     */
    public function cc_wget_file($url)
    {
        $failure = ' FAILED: Data Could not be retrieved using wget and file_get_contents';

        //exec() is frequently listed in disable_functions on shared hosts
        if (!function_exists('exec'))
        {
            $this->log_status(date("H:i:s d-m-Y").$failure);
            return FALSE;
        }

        $tmpFile = tempnam(sys_get_temp_dir(), 'rqf');
        if ($tmpFile === FALSE)
        {
            $this->log_status(date("H:i:s d-m-Y").$failure);
            return FALSE;
        }

        //escape every argument: the url is untrusted input to a shell.
        //"--" stops a url starting with "-" being parsed as an option.
        $cmd = 'wget -q -O '.escapeshellarg($tmpFile).' -- '.escapeshellarg($url);
        exec($cmd, $output, $status);

        $data = ($status === 0) ? @file_get_contents($tmpFile) : FALSE;
        @unlink($tmpFile);

        if ($data)
        {
            $this->log_status(date("H:i:s d-m-Y").' SUCCESS: File Downloaded Using wget');
            return $data;
        }

        $this->log_status(date("H:i:s d-m-Y").$failure);
        return FALSE;
    }

    /**
     * Select a random user agent from the db.
     *
     * The database lookup has been stripped from this version of the class,
     * so $row stays FALSE and the built-in default agent is always returned
     * until the lookup is filled in.
     *
     * @return string a User-Agent header value
     */
    public function rand_user_agent()
    {
        $sql = "SELECT * FROM cc_user_agents ORDER BY rand() LIMIT 1";
        $row = FALSE; //defined up front so the check below never reads an undefined variable

        //code to get a record from the database (into $row) goes here

        if ($row === FALSE || empty($row[0]))
        {
            $this->log_status(date("H:i:s d-m-Y").' FAIL: Could Not Retrieve Random User Agent - Using Default');
            return "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3";
        }

        return $row[0];
    }

    /**
     * Append a message to RequestFile.status.log (in the current working
     * directory) when logging is switched on.
     *
     * @param string $msg the line to record
     * @return bool always TRUE; logging is best-effort
     */
    public function log_status($msg)
    {
        if ($this->StatusLogOn === TRUE)
        {
            $path = "RequestFile.status.log";

            //file name first, data second; FILE_APPEND avoids re-reading
            //the whole log and copes with the file not existing yet
            file_put_contents($path, "\n".$msg, FILE_APPEND);
        }

        return TRUE;
    }
}
?>
Refactorings
No refactoring yet !
Ishkur
August 19, 2008, August 19, 2008 18:41, permalink
In your function 'public function cc_wget_file($url)', you're passing the string through exec(). That's fine, but PHP allows you to pass cli strings via backticks (http://www.php.net/language.operators.execution). Might help, might not, ymmv.
Chris Dean
August 20, 2008, August 20, 2008 16:20, permalink
@Ishkur
I try to avoid back-ticks as (for me at least) they obfuscate the code a little bit.
That said, it may give a small performance boost - I don't know as I've not done any such tests, but maybe I will if I get 5 mins :)
After finding different hosting companies having wildly different policies when it comes to what's enabled and what isn't in PHP, I'm trying to build a generic file download class which will work in almost all situations.
What I'd like to know is whether any of the code below can be made more efficient or robust, and also whether anyone has a function for achieving the equivalent wget call on Windows.
I should just mention that this class has been cleaned of some application-specific code, so if there are any disjointed bits I apologise - I have tried to deal with any discontinuities before posting but there may be something I've missed (this is also the reason for the bunch of if's in get_file() - the main version follows a few other processes/checks before attempting one of the download functions).
Thanks
Chris