mirror of
https://github.com/wallabag/wallabag.git
synced 2025-08-01 17:38:38 +00:00
[change] we now use Full-Text RSS 3.1, thank you so much @fivefilters
This commit is contained in:
parent
59cc585271
commit
42c80841c8
83 changed files with 23898 additions and 7845 deletions
404
inc/3rdparty/libraries/humble-http-agent/CookieJar.php
vendored
Normal file
404
inc/3rdparty/libraries/humble-http-agent/CookieJar.php
vendored
Normal file
|
@ -0,0 +1,404 @@
|
|||
<?php
|
||||
/**
|
||||
* Cookie Jar
|
||||
*
|
||||
* PHP class for handling cookies, as defined by the Netscape spec:
|
||||
* <http://curl.haxx.se/rfc/cookie_spec.html>
|
||||
*
|
||||
* This class should be used to handle cookies (storing cookies from HTTP response messages, and
|
||||
* sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org
|
||||
* from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
|
||||
*
|
||||
* This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
|
||||
* lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
|
||||
* Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
|
||||
*
|
||||
* @version 0.5
|
||||
* @date 2011-03-15
|
||||
* @see http://php.net/HttpRequestPool
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2011 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
*/
|
||||
|
||||
class CookieJar
|
||||
{
|
||||
/**
|
||||
* Cookies - array containing all cookies.
|
||||
*
|
||||
* <pre>
|
||||
* Cookies are stored like this:
|
||||
* [domain][path][name] = array
|
||||
* where array is:
|
||||
* 0 => value, 1 => secure, 2 => expires
|
||||
* </pre>
|
||||
* @var array
|
||||
* @access private
|
||||
*/
|
||||
public $cookies = array();
|
||||
public $debug = false;
|
||||
|
||||
/**
 * Create a cookie jar.
 *
 * The constructor performs no work of its own; the jar's properties keep
 * their declared defaults.
 */
function __construct()
{
}
|
||||
|
||||
/**
 * Print a debug message followed by current and peak memory usage (in KB).
 *
 * Nothing is printed unless $this->debug is true.
 *
 * @param string $msg  message to print
 * @param string $file optional file name; shown only when $line is also given
 * @param int    $line optional line number; shown only when $file is also given
 * @return void
 */
protected function debug($msg, $file=null, $line=null) {
    if (!$this->debug) return;
    $mem = round(memory_get_usage()/1024, 2);
    $memPeak = round(memory_get_peak_usage()/1024, 2);
    echo '* ',$msg;
    if (isset($file, $line)) echo " ($file line $line)";
    echo ' - mem used: ',$mem," (peak: $memPeak)\n";
    // ob_flush() raises a notice when no output buffer is active,
    // so only flush the buffer when there is one.
    if (ob_get_level() > 0) ob_flush();
    flush();
}
|
||||
|
||||
/**
 * Get matching cookies
 *
 * Builds the value of a "Cookie" request header from the cookies stored in
 * this jar that apply to the given URL. A cookie applies when its domain is
 * the request host or one of its parent domains, the request path starts
 * with the cookie path, the cookie is not secure-only on a non-https
 * request, and it has not expired.
 *
 * @param string $url absolute request URL (scheme and host are required; a
 *                    missing path is treated as "/")
 * @return string|false "name=value" pairs joined with "; " (empty string if
 *                      nothing matches), or false if $url cannot be parsed
 */
public function getMatchingCookies($url)
{
    $parts = @parse_url($url);
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return false;
    }
    $domain = $parts['host'];
    // hosts without a dot (e.g. "localhost") are stored under "<host>.local"
    if (strpos($domain, '.') === false) $domain .= '.local';
    // a URL such as "http://example.org" has no path component; it is a request for "/"
    $request_path = isset($parts['path']) ? $parts['path'] : '';
    if ($request_path == '') $request_path = '/';
    $request_secure = (strtolower($parts['scheme']) == 'https');
    unset($parts);
    // RFC 2965 notes:
    // If multiple cookies satisfy the criteria above, they are ordered in
    // the Cookie header such that those with more specific Path attributes
    // precede those with less specific. Ordering with respect to other
    // attributes (e.g., Domain) is unspecified.
    $now = time();
    $matched_cookies = array();
    // domain - walk from the full host up through its parent domains
    $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
    while (strpos($domain, '.') !== false) {
        if (isset($this->cookies[$domain])) {
            $this->debug(' domain match found: '.$domain);
            $cookies = $this->cookies[$domain];
            // paths - find matching paths starting from most specific (longest)
            $this->debug(' - Finding matching paths for '.$request_path);
            $paths = array_keys($cookies);
            usort($paths, array($this, '_cmp_length'));
            foreach ($paths as $path) {
                // skip cookie paths the request path does not path-match
                if (!$this->_path_match($request_path, $path)) continue;
                $this->debug(' path match found: '.$path);
                foreach ($cookies[$path] as $name => $values) {
                    // $values is array(0 => value, 1 => secure, 2 => expires)
                    // if this cookie is secure but request isn't, skip it
                    if ($values[1] && !$request_secure) continue;
                    // if cookie is not a session cookie and has expired, skip it
                    if (is_int($values[2]) && ($values[2] < $now)) continue;
                    // cookie matches request
                    $this->debug(' cookie match: '.$name.'='.$values[0]);
                    $matched_cookies[] = $name.'='.$values[0];
                }
            }
        }
        $domain = $this->_reduce_domain($domain);
    }
    // return cookies
    return implode('; ', $matched_cookies);
}
|
||||
|
||||
/**
 * Parse Set-Cookie values and store the resulting cookies in the jar.
 *
 * Each header value is split on ';'. The first name=value pair is the cookie
 * itself; of the remaining attributes only expires, path, domain and secure
 * are used. A cookie is rejected when its Domain attribute is not
 * domain-matched by the request host, or its Path attribute is not
 * path-matched by the request path.
 *
 * @param string $url         URL of the request that produced the Set-Cookie headers
 * @param array  $set_cookies array holding 1 or more "Set-Cookie" header values
 * @return void
 */
public function storeCookies($url, $set_cookies)
{
    if (count($set_cookies) == 0) return;
    $param = @parse_url($url);
    if (!is_array($param) || !isset($param['host'])) return;
    $request_host = $param['host'];
    // hosts without a dot (e.g. "localhost") are treated as "<host>.local"
    if (strpos($request_host, '.') === false) $request_host .= '.local';
    $request_path = isset($param['path']) ? $param['path'] : '';
    if ($request_path == '') $request_path = '/';
    //
    // loop through set-cookie headers
    //
    foreach ($set_cookies as $set_cookie) {
        $this->debug('Parsing: '.$set_cookie);
        // temporary cookie store (before adding to jar)
        $tmp_cookie = array();
        $attributes = explode(';', $set_cookie);
        // loop through attributes
        for ($x=0; $x<count($attributes); $x++) {
            $key_val = explode('=', $attributes[$x], 2);
            if (count($key_val) != 2) {
                // if the first param isn't a name=value pair, continue to the next set-cookie
                // header
                if ($x == 0) continue 2;
                // check for secure flag
                if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
                // continue to next param
                continue;
            }
            list($key, $val) = array_map('trim', $key_val);
            // first name=value pair is the cookie name and value
            // the name and value are stored under 'name' and 'value' to avoid conflicts
            // with later parameters.
            if ($x == 0) {
                $tmp_cookie = array('name'=>$key, 'value'=>$val);
                continue;
            }
            $key = strtolower($key);
            if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
                $tmp_cookie[$key] = $val;
            }
        }
        //
        // set cookie
        //
        // check domain
        if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
                ($tmp_cookie['domain'] != ".$request_host")) {
            $domain = $tmp_cookie['domain'];
            if ((strpos($domain, '.') === false) && ($domain != 'local')) {
                $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
                continue;
            }
            if (preg_match('/\.[0-9]+$/', $domain)) {
                $this->debug(' - domain "'.$domain.'" appears to be an ip address');
                continue;
            }
            if (substr($domain, 0, 1) != '.') $domain = ".$domain";
            if (!$this->_domain_match($request_host, $domain)) {
                $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
                continue;
            }
        } else {
            // if domain is not specified in the set-cookie header, domain will default to
            // the request host
            $domain = $request_host;
        }
        // check path
        if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
            $path = urldecode($tmp_cookie['path']);
            if (!$this->_path_match($request_path, $path)) {
                $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
                continue;
            }
        } else {
            // default path: the request path up to (not including) its last '/'
            $path = $request_path;
            $path = substr($path, 0, strrpos($path, '/'));
            if ($path == '') $path = '/';
        }
        // check if secure
        $secure = isset($tmp_cookie['secure']);
        // check expiry
        $expires = null;
        if (isset($tmp_cookie['expires'])) {
            $expires = strtotime($tmp_cookie['expires']);
            // strtotime() returns false for a date it cannot parse. Treat that as
            // "no expiry given" (session cookie); passing false on would make
            // set_cookie() delete the cookie because false <= 0. A date before
            // 1970 (negative timestamp) is passed on and so removes the cookie.
            if ($expires === false) $expires = null;
        }
        // set cookie
        $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
    }
}
|
||||
|
||||
/**
 * Extract Set-Cookie values from a block of raw HTTP response headers.
 *
 * Reading stops at the first blank line after the first line (end of
 * headers). A line starting with whitespace that directly follows a
 * Set-Cookie header is a folded continuation (RFC 2616 section 2.2) and is
 * appended to that header's value, separated by a single space.
 *
 * @param string $h raw response headers
 * @return array list of Set-Cookie header values, in order of appearance
 */
public function extractCookies($h) {
    $values = array();
    $count = 0;
    $prevWasSetCookie = false;
    $pastFirstLine = false;
    foreach (explode("\n", $h) as $rawLine) {
        $line = rtrim($rawLine);
        if ($pastFirstLine) {
            // an empty line marks the end of the headers (CR is optional,
            // it has already been stripped by rtrim)
            if (trim($line) == '') {
                break;
            }
            // folded continuation of the previous Set-Cookie header
            if ($prevWasSetCookie && preg_match('/^\s+(.*)/', $line, $match)) {
                $values[$count-1] .= ' '.rtrim($match[1]);
                continue;
            }
        }
        $pastFirstLine = true;
        // split header name and value
        if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {
            $values[$count++] = rtrim($match[1]);
            $prevWasSetCookie = true;
        } else {
            $prevWasSetCookie = false;
        }
    }
    return $values;
}
|
||||
|
||||
/**
 * Set Cookie
 *
 * Stores (or removes) a single cookie under [domain][path][name]. Nothing
 * happens when domain, path or name is empty. The expiry check comes before
 * the value check, so an expired cookie is removed even if its value is empty.
 *
 * @param string $domain
 * @param string $path
 * @param string $name cookie name
 * @param string $value cookie value (an empty value is not stored)
 * @param bool $secure
 * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
 * @return void
 */
function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
{
    // domain, path and name together address the cookie; all are required
    if ($domain == '' || $path == '' || $name == '') {
        return;
    }
    // check if cookie needs to go
    if (isset($expires) && ($expires <= 0)) {
        unset($this->cookies[$domain][$path][$name]);
        return;
    }
    if ($value == '') {
        return;
    }
    $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
}
|
||||
|
||||
/**
 * Clear cookies - [domain [,path [,name]]].
 *
 * The more arguments given, the narrower the removal: no arguments clears
 * every cookie, a domain clears that domain, domain and path clear that path,
 * and all three clear a single cookie.
 *
 * @param string $domain
 * @param string $path
 * @param string $name
 * @return void
 */
function clear($domain=null, $path=null, $name=null)
{
    if ($domain === null) {
        $this->cookies = array();
        return;
    }
    if ($path === null) {
        unset($this->cookies[$domain]);
        return;
    }
    if ($name === null) {
        unset($this->cookies[$domain][$path]);
        return;
    }
    unset($this->cookies[$domain][$path][$name]);
}
|
||||
|
||||
/**
 * Compare string length - sort callback that puts longer strings first.
 *
 * @access private
 * @param string $a
 * @param string $b
 * @return int -1 if $a is longer than $b, 1 if it is shorter, 0 if equal length
 */
function _cmp_length($a, $b)
{
    $lenA = strlen($a);
    $lenB = strlen($b);
    if ($lenA > $lenB) {
        return -1;
    }
    if ($lenA < $lenB) {
        return 1;
    }
    return 0;
}
|
||||
|
||||
/**
 * Reduce domain
 *
 * Takes one step towards the parent domain: a leading dot is dropped
 * (".example.com" becomes "example.com"); otherwise everything before the
 * first dot is dropped ("www.example.com" becomes ".example.com").
 *
 * @param string $domain
 * @return string
 * @access private
 */
function _reduce_domain($domain)
{
    if ($domain == '') {
        return '';
    }
    // cut after a leading dot, or else at the first dot
    $cutAt = (substr($domain, 0, 1) == '.') ? 1 : strpos($domain, '.');
    return substr($domain, $cutAt);
}
|
||||
|
||||
/**
 * Path match - check if path1 path-matches path2
 *
 * From RFC 2965:
 * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
 * if P2 is a prefix of P1 (including the case where P1 and P2 string-
 * compare equal). Thus, the string /tec/waldo path-matches /tec.</i>
 *
 * @param string $path1 path to test (e.g. the request path)
 * @param string $path2 prefix to test against (e.g. the cookie path)
 * @return bool true if $path2 is a prefix of $path1
 * @access private
 */
function _path_match($path1, $path2)
{
    // Byte-wise prefix comparison. A loose == between the two strings would
    // compare numeric-looking strings as numbers (e.g. '10' == '1e1').
    return strncmp($path1, $path2, strlen($path2)) === 0;
}
|
||||
|
||||
/**
 * Domain match - check if domain1 domain-matches domain2
 *
 * The comparison is case-insensitive. domain1 is compared with domain2 and
 * then repeatedly reduced (see _reduce_domain()) for as long as it still
 * contains a dot; the first equal comparison means a match.
 *
 * A few extracts from RFC 2965:
 * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
 *   would be rejected, because H is y.x and contains a dot.
 *
 * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
 *   would be accepted.
 *
 * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be
 *   rejected, because there is no embedded dot.
 *
 * - A Set-Cookie2 from request-host example for Domain=.local will
 *   be accepted, because the effective host name for the request-
 *   host is example.local, and example.local domain-matches .local.
 *
 * The first point is deliberately not enforced here (how other browsers
 * handle this rule for Set-Cookie headers still needs checking).
 *
 * @param string $domain1
 * @param string $domain2
 * @return bool
 * @access private
 */
function _domain_match($domain1, $domain2)
{
    $target = strtolower($domain2);
    for ($candidate = strtolower($domain1);
            strpos($candidate, '.') !== false;
            $candidate = $this->_reduce_domain($candidate)) {
        if ($candidate == $target) {
            return true;
        }
    }
    return false;
}
|
||||
}
|
||||
?>
|
779
inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
vendored
Normal file
779
inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
vendored
Normal file
|
@ -0,0 +1,779 @@
|
|||
<?php
|
||||
/**
|
||||
* Humble HTTP Agent
|
||||
*
|
||||
* This class is designed to take advantage of parallel HTTP requests
|
||||
* offered by PHP's PECL HTTP extension or the curl_multi_* functions.
|
||||
* For environments which do not have these options, it reverts to standard sequential
|
||||
* requests (using file_get_contents())
|
||||
*
|
||||
* @version 1.1
|
||||
* @date 2012-08-20
|
||||
* @see http://php.net/HttpRequestPool
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2011-2012 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
*/
|
||||
|
||||
class HumbleHttpAgent
|
||||
{
|
||||
const METHOD_REQUEST_POOL = 1;
|
||||
const METHOD_CURL_MULTI = 2;
|
||||
const METHOD_FILE_GET_CONTENTS = 4;
|
||||
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
|
||||
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
|
||||
const UA_PHP = 'PHP/5.2';
|
||||
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
|
||||
|
||||
protected $requests = array();
|
||||
protected $redirectQueue = array();
|
||||
protected $requestOptions;
|
||||
protected $maxParallelRequests = 5;
|
||||
protected $cache = null; //TODO
|
||||
protected $httpContext;
|
||||
protected $minimiseMemoryUse = false; //TODO
|
||||
protected $method;
|
||||
protected $cookieJar;
|
||||
public $debug = false;
|
||||
public $debugVerbose = false;
|
||||
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
|
||||
public $maxRedirects = 5;
|
||||
public $userAgentMap = array();
|
||||
public $rewriteUrls = array();
|
||||
public $userAgentDefault;
|
||||
public $referer;
|
||||
//public $userAgent = 'Mozilla/5.0';
|
||||
|
||||
// Prevent certain file/mime types
|
||||
// HTTP responses which match these content types will
|
||||
// be returned without body.
|
||||
public $headerOnlyTypes = array();
|
||||
// URLs ending with one of these extensions will
|
||||
// prompt Humble HTTP Agent to send a HEAD request first
|
||||
// to see if returned content type matches $headerOnlyTypes.
|
||||
public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
|
||||
// AJAX triggers to search for.
|
||||
// for AJAX sites, e.g. Blogger with its dynamic views templates.
|
||||
public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
|
||||
|
||||
//TODO: set max file size
|
||||
//TODO: normalise headers
|
||||
|
||||
/**
 * Set up the agent: default user agent and referer, request method, cookie
 * jar, request options and the HTTP stream context.
 *
 * @param array|null $requestOptions options merged over the defaults
 *                                   ('timeout' => 15, 'redirect' => 0)
 * @param int|null   $method         one of the METHOD_* constants; any other
 *                                   value selects the first available of
 *                                   HttpRequestPool, curl_multi_*, file_get_contents()
 */
function __construct($requestOptions=null, $method=null) {
    $this->userAgentDefault = self::UA_BROWSER;
    $this->referer = self::REF_GOOGLE;
    // set the request method: honour an explicit choice, otherwise auto-detect
    if (in_array($method, array(1,2,4))) {
        $this->method = $method;
    } elseif (class_exists('HttpRequestPool')) {
        $this->method = self::METHOD_REQUEST_POOL;
    } elseif (function_exists('curl_multi_init')) {
        $this->method = self::METHOD_CURL_MULTI;
    } else {
        $this->method = self::METHOD_FILE_GET_CONTENTS;
    }
    if ($this->method == self::METHOD_CURL_MULTI) {
        require_once(dirname(__FILE__).'/RollingCurl.php');
    }
    // create cookie jar
    $this->cookieJar = new CookieJar();
    // set request options (redirect must be 0)
    $defaults = array(
        'timeout' => 15,
        'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
        // TODO: test onprogress?
    );
    $this->requestOptions = is_array($requestOptions)
        ? array_merge($defaults, $requestOptions)
        : $defaults;
    $this->httpContext = array(
        'http' => array(
            'ignore_errors' => true,
            'timeout' => $this->requestOptions['timeout'],
            'max_redirects' => $this->requestOptions['redirect'],
            'header' => "Accept: */*\r\n"
        )
    );
}
|
||||
|
||||
/**
 * Print a debug message when debugging is enabled.
 *
 * Memory usage (current and peak, in KB) is appended only when
 * $this->debugVerbose is true.
 *
 * @param string $msg message to print
 * @return void
 */
protected function debug($msg) {
    if (!$this->debug) return;
    $mem = round(memory_get_usage()/1024, 2);
    $memPeak = round(memory_get_peak_usage()/1024, 2);
    echo '* ',$msg;
    if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
    echo "\n";
    // ob_flush() raises a notice when no output buffer is active,
    // so only flush the buffer when there is one.
    if (ob_get_level() > 0) ob_flush();
    flush();
}
|
||||
|
||||
/**
 * Pick the User-Agent to send for a URL.
 *
 * The host (with a leading "www." removed) is looked up in
 * $this->userAgentMap, first as-is and then as ".parent.domain" (the host
 * with its first label removed). If neither key exists,
 * $this->userAgentDefault is used.
 *
 * @param string $url     request URL
 * @param bool   $asArray true to get array('User-Agent' => ...), false to
 *                        get the header line "User-Agent: ..."
 * @return array|string
 */
protected function getUserAgent($url, $asArray=false) {
    $host = @parse_url($url, PHP_URL_HOST);
    // parse_url() gives null/false when there is no host; normalise so the
    // string functions below are never handed a non-string
    if (!is_string($host)) $host = '';
    if (strtolower(substr($host, 0, 4)) == 'www.') {
        $host = (string)substr($host, 4);
    }
    if ($host) {
        $try = array($host);
        $split = explode('.', $host);
        if (count($split) > 1) {
            // also try the parent domain, written with a leading dot
            array_shift($split);
            $try[] = '.'.implode('.', $split);
        }
        foreach ($try as $h) {
            if (isset($this->userAgentMap[$h])) {
                $ua = $this->userAgentMap[$h];
                break;
            }
        }
    }
    if (!isset($ua)) $ua = $this->userAgentDefault;
    if ($asArray) {
        return array('User-Agent' => $ua);
    }
    return 'User-Agent: '.$ua;
}
|
||||
|
||||
/**
 * Rewrite a hashbang ("#!") URL by moving its fragment into an
 * _escaped_fragment_ query parameter.
 *
 * URLs without "#!" are returned unchanged.
 *
 * @param string $url
 * @return string rewritten URL as produced by SimplePie_IRI::get_iri()
 */
public function rewriteHashbangFragment($url) {
    // nothing to do if there's no '#!'
    if (strpos($url, '#!') === false) {
        return $url;
    }
    // split $url and rewrite
    // TODO: is SimplePie_IRI included?
    $iri = new SimplePie_IRI($url);
    $escaped = substr($iri->fragment, 1); // strip '!'
    $iri->fragment = null;
    $query = array();
    if (isset($iri->query)) {
        parse_str($iri->query, $query);
    }
    $query['_escaped_fragment_'] = (string)$escaped;
    $built = http_build_query($query);
    $iri->query = str_replace('%2F', '/', $built); // needed for some sites
    return $iri->get_iri();
}
|
||||
|
||||
/**
 * Build the "ugly" (_escaped_fragment_) URL for a page that carries one of
 * the AJAX trigger strings listed in $this->ajaxTriggers.
 *
 * @param string $url  URL the HTML was fetched from
 * @param string $html HTML (or the start of it) to search for a trigger
 * @return string|false $url with an empty _escaped_fragment_ query parameter
 *                      added, or false if $html is empty or has no trigger
 */
public function getUglyURL($url, $html) {
    if ($html == '') return false;
    $found = false;
    foreach ($this->ajaxTriggers as $string) {
        // stripos() returns 0 for a match at the very start of $html, so the
        // result must be compared against false rather than tested for truth
        if (stripos($html, $string) !== false) {
            $found = true;
            break;
        }
    }
    if (!$found) return false;
    $iri = new SimplePie_IRI($url);
    if (isset($iri->query)) {
        parse_str($iri->query, $query);
    } else {
        $query = array();
    }
    $query['_escaped_fragment_'] = '';
    $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
    return $iri->get_iri();
}
|
||||
|
||||
/**
 * Strip the fragment from a URL: everything from the first '#' onwards.
 *
 * @param string $url
 * @return string $url unchanged if it contains no '#'
 */
public function removeFragment($url) {
    $hashAt = strpos($url, '#');
    return ($hashAt === false) ? $url : substr($url, 0, $hashAt);
}
|
||||
|
||||
/**
 * Apply the first applicable rule from $this->rewriteUrls to a URL.
 *
 * Each rule maps a search string to an action. A rule applies when the URL
 * contains the search string and the action is an array; the action is then
 * used as the replacement map for strtr() and no further rules are tried.
 *
 * @param string $url
 * @return string rewritten URL, or $url unchanged if no rule applies
 */
public function rewriteUrls($url) {
    foreach ($this->rewriteUrls as $find => $action) {
        if (strpos($url, $find) === false) {
            continue;
        }
        if (!is_array($action)) {
            continue;
        }
        return strtr($url, $action);
    }
    return $url;
}
|
||||
|
||||
/**
 * Switch debug output on or off.
 *
 * @param bool $bool value is cast to bool before being stored
 * @return void
 */
public function enableDebug($bool=true) {
    $enabled = (bool)$bool;
    $this->debug = $enabled;
}
|
||||
|
||||
/**
 * Store the "minimise memory use" flag.
 *
 * @param bool $bool stored as given (no cast)
 * @return void
 */
public function minimiseMemoryUse($bool = true) {
    $flag = $bool;
    $this->minimiseMemoryUse = $flag;
}
|
||||
|
||||
/**
 * Set how many URLs are requested in one parallel batch.
 *
 * @param int $max batch size; values below 1 are raised to 1
 * @return void
 */
public function setMaxParallelRequests($max) {
    // The batch size is used as the length argument of array_splice() inside
    // a "while (count($urls) > 0)" loop; a length of 0 (or a negative length
    // on a short list) removes nothing and the loop would never finish.
    $this->maxParallelRequests = max(1, (int)$max);
}
|
||||
|
||||
/**
 * Sanitise a URL and check that it is a valid http or https URL.
 *
 * @param string $url
 * @return string|false the sanitised URL, or false if it is not a valid
 *                      URL starting with "http://" or "https://"
 */
public function validateUrl($url) {
    $url = filter_var($url, FILTER_SANITIZE_URL);
    // FILTER_VALIDATE_URL always requires a scheme. The former
    // FILTER_FLAG_SCHEME_REQUIRED flag was deprecated in PHP 7.3 and removed
    // in PHP 8.0, where referencing it is a fatal error.
    $test = filter_var($url, FILTER_VALIDATE_URL);
    // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
    if ($test === false) {
        $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
    }
    if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
        return $url;
    }
    return false;
}
|
||||
|
||||
/**
 * Fetch all URLs, then keep fetching whatever is left in the redirect queue.
 *
 * After the initial pass, the redirect queue is re-fetched until it is empty
 * or $this->maxRedirects passes have been made.
 *
 * @param array $urls URLs to fetch
 * @return void
 */
public function fetchAll(array $urls) {
    $this->fetchAllOnce($urls, false);
    for ($redirects = 1;
            !empty($this->redirectQueue) && $redirects <= $this->maxRedirects;
            $redirects++) {
        $this->debug("Following redirects #$redirects...");
        $this->fetchAllOnce($this->redirectQueue, true);
    }
}
|
||||
|
||||
// fetch all URLs without following redirects
|
||||
public function fetchAllOnce(array $urls, $isRedirect=false) {
|
||||
if (!$isRedirect) $urls = array_unique($urls);
|
||||
if (empty($urls)) return;
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// parallel (HttpRequestPool)
|
||||
if ($this->method == self::METHOD_REQUEST_POOL) {
|
||||
$this->debug('Starting parallel fetch (HttpRequestPool)');
|
||||
try {
|
||||
while (count($urls) > 0) {
|
||||
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
|
||||
$subset = array_splice($urls, 0, $this->maxParallelRequests);
|
||||
$pool = new HttpRequestPool();
|
||||
foreach ($subset as $orig => $url) {
|
||||
if (!$isRedirect) $orig = $url;
|
||||
unset($this->redirectQueue[$orig]);
|
||||
$this->debug("...$url");
|
||||
if (!$isRedirect && isset($this->requests[$url])) {
|
||||
$this->debug("......in memory");
|
||||
/*
|
||||
} elseif ($this->isCached($url)) {
|
||||
$this->debug("......is cached");
|
||||
if (!$this->minimiseMemoryUse) {
|
||||
$this->requests[$url] = $this->getCached($url);
|
||||
}
|
||||
*/
|
||||
} else {
|
||||
$this->debug("......adding to pool");
|
||||
$req_url = $this->rewriteUrls($url);
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
|
||||
$_meth = HttpRequest::METH_HEAD;
|
||||
} else {
|
||||
$_meth = HttpRequest::METH_GET;
|
||||
unset($this->requests[$orig]['wrongGuess']);
|
||||
}
|
||||
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
|
||||
// send cookies, if we have any
|
||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$httpRequest->addHeaders(array('Cookie' => $cookies));
|
||||
}
|
||||
//$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
|
||||
$httpRequest->addHeaders($this->getUserAgent($req_url, true));
|
||||
// add referer for picky sites
|
||||
$httpRequest->addheaders(array('Referer' => $this->referer));
|
||||
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
|
||||
$this->requests[$orig]['original_url'] = $orig;
|
||||
$pool->attach($httpRequest);
|
||||
}
|
||||
}
|
||||
// did we get anything into the pool?
|
||||
if (count($pool) > 0) {
|
||||
$this->debug('Sending request...');
|
||||
try {
|
||||
$pool->send();
|
||||
} catch (HttpRequestPoolException $e) {
|
||||
// do nothing
|
||||
}
|
||||
$this->debug('Received responses');
|
||||
foreach($subset as $orig => $url) {
|
||||
if (!$isRedirect) $orig = $url;
|
||||
$request = $this->requests[$orig]['httpRequest'];
|
||||
//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
|
||||
// getResponseHeader() doesn't return status line, so, for consistency...
|
||||
$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
|
||||
// check content type
|
||||
// TODO: use getResponseHeader('content-type') or getResponseInfo()
|
||||
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
|
||||
$this->requests[$orig]['body'] = '';
|
||||
$_header_only_type = true;
|
||||
$this->debug('Header only type returned');
|
||||
} else {
|
||||
$this->requests[$orig]['body'] = $request->getResponseBody();
|
||||
$_header_only_type = false;
|
||||
}
|
||||
$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
|
||||
$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
|
||||
// is redirect?
|
||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
|
||||
$redirectURL = $request->getResponseHeader('location');
|
||||
if (!preg_match('!^https?://!i', $redirectURL)) {
|
||||
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
$cookies = $request->getResponseHeader('set-cookie');
|
||||
if ($cookies && !is_array($cookies)) $cookies = array($cookies);
|
||||
if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
}
|
||||
} elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
|
||||
// the response content-type did not match our 'header only' types,
|
||||
// but we'd issues a HEAD request because we assumed it would. So
|
||||
// let's queue a proper GET request for this item...
|
||||
$this->debug('Wrong guess at content-type, queing GET request');
|
||||
$this->requests[$orig]['wrongGuess'] = true;
|
||||
$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
|
||||
} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
|
||||
// check for <meta name='fragment' content='!'/>
|
||||
// for AJAX sites, e.g. Blogger with its dynamic views templates.
|
||||
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
|
||||
if (isset($this->requests[$orig]['body'])) {
|
||||
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
if ($redirectURL) {
|
||||
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
}
|
||||
}
|
||||
}
|
||||
//die($url.' -multi- '.$request->getResponseInfo('effective_url'));
|
||||
$pool->detach($request);
|
||||
unset($this->requests[$orig]['httpRequest'], $request);
|
||||
/*
|
||||
if ($this->minimiseMemoryUse) {
|
||||
if ($this->cache($url)) {
|
||||
unset($this->requests[$url]);
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (HttpException $e) {
|
||||
$this->debug($e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// parallel (curl_multi_*)
|
||||
elseif ($this->method == self::METHOD_CURL_MULTI) {
|
||||
$this->debug('Starting parallel fetch (curl_multi_*)');
|
||||
while (count($urls) > 0) {
|
||||
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
|
||||
$subset = array_splice($urls, 0, $this->maxParallelRequests);
|
||||
$pool = new RollingCurl(array($this, 'handleCurlResponse'));
|
||||
$pool->window_size = count($subset);
|
||||
|
||||
foreach ($subset as $orig => $url) {
|
||||
if (!$isRedirect) $orig = $url;
|
||||
unset($this->redirectQueue[$orig]);
|
||||
$this->debug("...$url");
|
||||
if (!$isRedirect && isset($this->requests[$url])) {
|
||||
$this->debug("......in memory");
|
||||
/*
|
||||
} elseif ($this->isCached($url)) {
|
||||
$this->debug("......is cached");
|
||||
if (!$this->minimiseMemoryUse) {
|
||||
$this->requests[$url] = $this->getCached($url);
|
||||
}
|
||||
*/
|
||||
} else {
|
||||
$this->debug("......adding to pool");
|
||||
$req_url = $this->rewriteUrls($url);
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
|
||||
$_meth = 'HEAD';
|
||||
} else {
|
||||
$_meth = 'GET';
|
||||
unset($this->requests[$orig]['wrongGuess']);
|
||||
}
|
||||
$headers = array();
|
||||
//$headers[] = 'User-Agent: '.$this->userAgent;
|
||||
$headers[] = $this->getUserAgent($req_url);
|
||||
// add referer for picky sites
|
||||
$headers[] = 'Referer: '.$this->referer;
|
||||
// send cookies, if we have any
|
||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$headers[] = 'Cookie: '.$cookies;
|
||||
}
|
||||
$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
|
||||
CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
|
||||
CURLOPT_TIMEOUT => $this->requestOptions['timeout']
|
||||
));
|
||||
$httpRequest->set_original_url($orig);
|
||||
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
|
||||
$this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
|
||||
$pool->add($httpRequest);
|
||||
}
|
||||
}
|
||||
// did we get anything into the pool?
|
||||
if (count($pool) > 0) {
|
||||
$this->debug('Sending request...');
|
||||
$pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
|
||||
$this->debug('Received responses');
|
||||
foreach($subset as $orig => $url) {
|
||||
if (!$isRedirect) $orig = $url;
|
||||
// $this->requests[$orig]['headers']
|
||||
// $this->requests[$orig]['body']
|
||||
// $this->requests[$orig]['effective_url']
|
||||
// check content type
|
||||
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
|
||||
$this->requests[$orig]['body'] = '';
|
||||
$_header_only_type = true;
|
||||
$this->debug('Header only type returned');
|
||||
} else {
|
||||
$_header_only_type = false;
|
||||
}
|
||||
$status_code = $this->requests[$orig]['status_code'];
|
||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
|
||||
$redirectURL = $this->requests[$orig]['location'];
|
||||
if (!preg_match('!^https?://!i', $redirectURL)) {
|
||||
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
||||
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
}
|
||||
} elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
|
||||
// the response content-type did not match our 'header only' types,
|
||||
// but we'd issues a HEAD request because we assumed it would. So
|
||||
// let's queue a proper GET request for this item...
|
||||
$this->debug('Wrong guess at content-type, queing GET request');
|
||||
$this->requests[$orig]['wrongGuess'] = true;
|
||||
$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
|
||||
} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
|
||||
// check for <meta name='fragment' content='!'/>
|
||||
// for AJAX sites, e.g. Blogger with its dynamic views templates.
|
||||
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
|
||||
if (isset($this->requests[$orig]['body'])) {
|
||||
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
if ($redirectURL) {
|
||||
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
}
|
||||
}
|
||||
}
|
||||
// die($url.' -multi- '.$request->getResponseInfo('effective_url'));
|
||||
unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// sequential (file_get_contents)
|
||||
else {
|
||||
$this->debug('Starting sequential fetch (file_get_contents)');
|
||||
$this->debug('Processing set of '.count($urls));
|
||||
foreach ($urls as $orig => $url) {
|
||||
if (!$isRedirect) $orig = $url;
|
||||
unset($this->redirectQueue[$orig]);
|
||||
$this->debug("...$url");
|
||||
if (!$isRedirect && isset($this->requests[$url])) {
|
||||
$this->debug("......in memory");
|
||||
/*
|
||||
} elseif ($this->isCached($url)) {
|
||||
$this->debug("......is cached");
|
||||
if (!$this->minimiseMemoryUse) {
|
||||
$this->requests[$url] = $this->getCached($url);
|
||||
}
|
||||
*/
|
||||
} else {
|
||||
$this->debug("Sending request for $url");
|
||||
$this->requests[$orig]['original_url'] = $orig;
|
||||
$req_url = $this->rewriteUrls($url);
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
// send cookies, if we have any
|
||||
$httpContext = $this->httpContext;
|
||||
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
|
||||
// add referer for picky sites
|
||||
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
|
||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
|
||||
}
|
||||
if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
|
||||
$this->debug('Received response');
|
||||
// get status code
|
||||
if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
|
||||
$this->debug('Error: no status code found');
|
||||
// TODO: handle error - no status code
|
||||
} else {
|
||||
$this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
|
||||
// check content type
|
||||
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
|
||||
$this->requests[$orig]['body'] = '';
|
||||
} else {
|
||||
$this->requests[$orig]['body'] = $html;
|
||||
}
|
||||
$this->requests[$orig]['effective_url'] = $req_url;
|
||||
$this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
|
||||
unset($match);
|
||||
// handle redirect
|
||||
if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
|
||||
$this->requests[$orig]['location'] = trim($match[1]);
|
||||
}
|
||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
|
||||
$redirectURL = $this->requests[$orig]['location'];
|
||||
if (!preg_match('!^https?://!i', $redirectURL)) {
|
||||
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
||||
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
}
|
||||
} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
|
||||
// check for <meta name='fragment' content='!'/>
|
||||
// for AJAX sites, e.g. Blogger with its dynamic views templates.
|
||||
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
|
||||
if (isset($this->requests[$orig]['body'])) {
|
||||
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
if ($redirectURL) {
|
||||
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
$this->debug('Error retrieving URL');
|
||||
//print_r($req_url);
|
||||
//print_r($http_response_header);
|
||||
//print_r($html);
|
||||
|
||||
// TODO: handle error - failed to retrieve URL
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * RollingCurl callback: stores the result of one completed cURL transfer.
 *
 * The raw response is split into headers and body at $info['header_size'] and
 * written to $this->requests, keyed by the request's original URL.
 *
 * @param string|false|null $response raw response (headers followed by body)
 * @param array $info curl_getinfo() data; 'header_size', 'url' and 'http_code' are read
 * @param object $request request object exposing ->url_original and ->method
 * @return void
 */
public function handleCurlResponse($response, $info, $request) {
    $orig = $request->url_original;
    // A failed transfer may hand us false/null rather than a string;
    // treat that as an empty response instead of feeding it to substr().
    $raw = is_string($response) ? $response : '';
    $headerSize = (int) $info['header_size'];

    $entry =& $this->requests[$orig];
    $entry['headers'] = (string) substr($raw, 0, $headerSize);
    $entry['body'] = (string) substr($raw, $headerSize);
    $entry['method'] = $request->method;
    $entry['effective_url'] = $info['url'];
    $entry['status_code'] = (int) $info['http_code'];
    // Remember the redirect target (first Location header), if any.
    if (preg_match('/^Location:(.*?)$/mi', $entry['headers'], $match)) {
        $entry['location'] = trim($match[1]);
    }
    unset($entry);
}
|
||||
|
||||
/**
 * Flatten a header array into a newline-separated string.
 *
 * @param array $headers either a list of complete header lines, or a
 *                       name => value map where a value may itself be a
 *                       list of values for that header name
 * @param bool $associative true if $headers is a name => value map
 * @return string header lines joined with "\n"; for the associative form
 *                trailing whitespace is trimmed
 */
protected function headersToString(array $headers, $associative=true) {
    if (!$associative) {
        return implode("\n", $headers);
    }
    $lines = array();
    foreach ($headers as $key => $val) {
        // a header name may carry several values: emit one line per value
        $values = is_array($val) ? $val : array($val);
        foreach ($values as $v) {
            $lines[] = "$key: $v";
        }
    }
    return rtrim(implode("\n", $lines));
}
|
||||
|
||||
/**
 * Return the response for a URL, fetching it first if it is not in memory.
 *
 * (The disk-cache lookups that used to live here are disabled.)
 *
 * @param string $url URL to retrieve (cast to string before use)
 * @param bool $remove if true, drop the in-memory entry once it has been read
 * @param bool $gzdecode if true, decode the body when the response headers
 *                       contain "Content-Encoding: gzip"
 * @return array|false the entry stored in $this->requests for $url, or false
 *                     if no body could be obtained
 */
public function get($url, $remove=false, $gzdecode=true) {
    $url = "$url";
    if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
        $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
        $response = $this->requests[$url];
    } else {
        $this->debug("Fetching URL ($url)");
        $this->fetchAll(array($url));
        if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
            $response = $this->requests[$url];
        } else {
            $this->debug("Request failed");
            // Nothing to remove or decode: bail out before $response is
            // treated as an array below.
            return false;
        }
    }

    if ($remove) unset($this->requests[$url]);

    if ($gzdecode && $response['body'] !== '' && isset($response['headers'])
        && stripos((string) $response['headers'], 'Content-Encoding: gzip') !== false) {
        // keep the original body if decoding fails or yields nothing
        if ($html = gzdecode($response['body'])) {
            $response['body'] = $html;
        }
    }
    return $response;
}
|
||||
|
||||
/**
 * Report whether a parallel fetching backend is available.
 *
 * @return bool true if the HttpRequestPool class or the curl_multi_init()
 *              function exists
 */
public function parallelSupport() {
    if (class_exists('HttpRequestPool')) {
        return true;
    }
    return function_exists('curl_multi_init');
}
|
||||
|
||||
/**
 * Check whether the response's Content-Type is listed in $this->headerOnlyTypes.
 *
 * Both the full mime type (e.g. image/jpeg) and the bare top-level type
 * (e.g. image) are compared, lower-cased, against the list.
 *
 * @param string|null $headers raw response headers
 * @return bool true if the content type (or its top-level type) is listed
 */
private function headerOnlyType($headers) {
    // $headers may be null when no response was recorded for a request;
    // cast so preg_match() always receives a string.
    if (!preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', (string) $headers, $match)) {
        return false;
    }
    $fullType = strtolower(trim($match[1]));
    $mainType = strtolower(trim($match[2]));
    return in_array($fullType, $this->headerOnlyTypes) || in_array($mainType, $this->headerOnlyTypes);
}
|
||||
|
||||
/**
 * Guess from the URL's file extension whether the resource may be of a
 * header-only type.
 *
 * @param string $url URL about to be requested
 * @return bool true if the path has an extension listed in $this->headerOnlyClues
 */
private function possibleUnsupportedType($url) {
    // parse_url() can emit a warning on badly malformed URLs; a failed
    // parse is simply treated as "no clue".
    $path = @parse_url($url, PHP_URL_PATH);
    if (!$path || strpos($path, '.') === false) {
        return false;
    }
    $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
    return in_array($ext, $this->headerOnlyClues);
}
|
||||
}
|
||||
|
||||
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
|
||||
if (!function_exists('gzdecode')) {
    /**
     * Userland fallback for gzdecode(), defined only when PHP does not
     * provide the function itself. Parses the gzip header (RFC 1952),
     * inflates the deflate stream and verifies the trailer (CRC32 + size).
     *
     * @param string $data complete gzip member: header, deflate data, 8-byte trailer
     * @param string $filename receives the file name stored in the header ('' if none)
     * @param string $error receives a short message for some failure modes
     * @param int|null $maxlength maximum decoded length handed to gzinflate(); null/0 = no limit
     * @return string|false|null decoded data; null if the input is not gzip, uses
     *                           reserved flag bits or has no body; false if the
     *                           header is malformed, the method is unknown,
     *                           inflation fails or the trailer does not match
     */
    function gzdecode($data, &$filename = '', &$error = '', $maxlength = null)
    {
        $len = strlen($data);
        if ($len < 18 || strcmp(substr($data, 0, 2), "\x1f\x8b")) {
            $error = "Not in GZIP format.";
            return null; // Not GZIP format (See RFC 1952)
        }
        $method = ord(substr($data, 2, 1)); // Compression method
        $flags = ord(substr($data, 3, 1)); // Flags
        // Parentheses are required: != binds tighter than &, so without them
        // the reserved-bit check compared 31 with $flags first.
        if (($flags & 31) != $flags) {
            $error = "Reserved bits not allowed.";
            return null;
        }
        // Bytes 4-9 (MTIME, XFL, OS) are not needed for decoding.
        $headerlen = 10;

        if ($flags & 4) {
            // FEXTRA: 2-byte little-endian length directly after the fixed
            // 10-byte header, followed by that many bytes of extra data
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $extralen = unpack("v", substr($data, $headerlen, 2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false; // invalid
            }
            $headerlen += 2 + $extralen;
        }

        $filename = "";
        if ($flags & 8) {
            // FNAME: C-style (NUL terminated) string
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $filenamelen = strpos(substr($data, $headerlen), chr(0));
            if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
                return false; // invalid
            }
            $filename = substr($data, $headerlen, $filenamelen);
            $headerlen += $filenamelen + 1;
        }

        if ($flags & 16) {
            // FCOMMENT: C-style string, skipped
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $commentlen = strpos(substr($data, $headerlen), chr(0));
            if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
                return false; // Invalid header format
            }
            $headerlen += $commentlen + 1;
        }

        if ($flags & 2) {
            // FHCRC: 2 bytes (lowest order) of CRC32 over the header so far
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
            $headercrc = unpack("v", substr($data, $headerlen, 2));
            $headercrc = $headercrc[1];
            if ($headercrc != $calccrc) {
                $error = "Header checksum failed.";
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }

        // GZIP FOOTER: CRC32 of the uncompressed data, then its size
        $datacrc = unpack("V", substr($data, -8, 4));
        $datacrc = sprintf('%u', $datacrc[1] & 0xFFFFFFFF);
        $isize = unpack("V", substr($data, -4));
        $isize = $isize[1];

        // decompression:
        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            // IMPLEMENTATION BUG!
            return null;
        }
        if ($method != 8) {
            // deflate (8) is currently the only supported compression method
            $error = "Unknown compression method.";
            return false;
        }
        $body = substr($data, $headerlen, $bodylen);
        // gzinflate() expects an int length; 0 means "no limit"
        $inflated = gzinflate($body, (int) $maxlength);
        if ($inflated === false) {
            $error = "Decompression failed.";
            return false;
        }

        // Verify CRC32 and length against the footer
        $crc = sprintf("%u", crc32($inflated));
        $crcOK = $crc == $datacrc;
        $lenOK = $isize == strlen($inflated);
        if (!$lenOK || !$crcOK) {
            $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
            return false;
        }
        return $inflated;
    }
}
|
||||
?>
|
402
inc/3rdparty/libraries/humble-http-agent/RollingCurl.php
vendored
Normal file
402
inc/3rdparty/libraries/humble-http-agent/RollingCurl.php
vendored
Normal file
|
@ -0,0 +1,402 @@
|
|||
<?php
|
||||
/*
|
||||
Authored by Josh Fraser (www.joshfraser.com)
|
||||
Released under Apache License 2.0
|
||||
|
||||
Maintained by Alexander Makarov, http://rmcreative.ru/
|
||||
|
||||
Modified by Keyvan Minoukadeh for the Five Filters project: http://fivefilters.org
|
||||
*/
|
||||
|
||||
/**
|
||||
* Class that represents a single curl request
|
||||
*/
|
||||
class RollingCurlRequest {
    /** @var string|bool URL to request (false until set) */
    public $url = false;
    /** @var string|bool URL the request was originally made for; used for tracking redirects */
    public $url_original = false;
    /** @var string HTTP method, e.g. 'GET' or 'HEAD' */
    public $method = 'GET';
    /** @var mixed data for a POST request */
    public $post_data = null;
    /** @var array|null extra headers for this request */
    public $headers = null;
    /** @var array|null extra cURL options for this request */
    public $options = null;

    /**
     * @param string $url URL to request; also becomes the initial original URL
     * @param string $method
     * @param $post_data
     * @param $headers
     * @param $options
     * @return void
     */
    public function __construct($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
        $this->url = $url;
        $this->url_original = $url;
        $this->method = $method;
        $this->post_data = $post_data;
        $this->headers = $headers;
        $this->options = $options;
    }

    /**
     * Record the URL this request ultimately belongs to, which differs from
     * $url when the request follows a redirect.
     *
     * @param string $url
     * @return void
     */
    public function set_original_url($url) {
        $this->url_original = $url;
    }

    /**
     * @return void
     */
    public function __destruct() {
        unset($this->url, $this->url_original, $this->method, $this->post_data, $this->headers, $this->options);
    }
}
|
||||
|
||||
/**
|
||||
* RollingCurl custom exception
|
||||
*/
|
||||
class RollingCurlException extends Exception
{
    // Marker type only: lets callers tell RollingCurl failures apart
    // from other exceptions.
}
|
||||
|
||||
/**
|
||||
* Class that holds a rolling queue of curl requests.
|
||||
*
|
||||
* @throws RollingCurlException
|
||||
*/
|
||||
class RollingCurl implements Countable {
    /**
     * @var int
     *
     * Window size is the max number of simultaneous connections allowed.
     *
     * REMEMBER TO RESPECT THE SERVERS:
     * Sending too many requests at one time can easily be perceived
     * as a DOS attack. Increase this window_size if you are making requests
     * to multiple servers or have permission from the receiving server admins.
     */
    private $window_size = 5;

    /**
     * @var float
     *
     * Timeout intended for curl_multi_select (not passed to it at present,
     * see rolling_curl()).
     */
    private $timeout = 10;

    /**
     * @var string|array
     *
     * Callback function to be applied to each result.
     */
    private $callback;

    /**
     * @var array
     *
     * Set your base options that you want to be used with EVERY request.
     *
     * NOTE(review): CURLOPT_SSL_VERIFYPEER => 0 turns off TLS certificate
     * verification for every request. Left unchanged to preserve behaviour.
     */
    protected $options = array(
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 30
    );

    /**
     * @var array
     *
     * Base headers (list of "Name: value" lines) sent with every request.
     */
    private $headers = array();

    /**
     * @var Request[]
     *
     * The request queue
     */
    private $requests = array();

    /**
     * @var RequestMap[]
     *
     * Maps handles to request indexes
     */
    private $requestMap = array();

    /**
     * @param $callback
     * Callback function to be applied to each result.
     *
     * Can be specified as 'my_callback_function'
     * or array($object, 'my_callback_method').
     *
     * Function should take three parameters: $response, $info, $request.
     * $response is response body, $info is additional curl info.
     * $request is the original request
     *
     * @return void
     */
    public function __construct($callback = null) {
        $this->callback = $callback;
    }

    /**
     * Read access to the non-public properties.
     *
     * @param string $name
     * @return mixed property value, or null if it is unset/null
     */
    public function __get($name) {
        return (isset($this->{$name})) ? $this->{$name} : null;
    }

    /**
     * Write access to the non-public properties. Values assigned to
     * "options" or "headers" are combined with what is already set rather
     * than replacing it.
     *
     * @param string $name
     * @param mixed $value
     * @return bool
     */
    public function __set($name, $value) {
        if ($name == "options") {
            // keyed by CURLOPT_* constant: new values win, the rest is kept
            $this->options = $value + $this->options;
        } elseif ($name == "headers") {
            // headers are a plain list: "+" would silently drop entries that
            // share a numeric index, so concatenate instead
            $this->headers = array_merge($value, $this->headers);
        } else {
            $this->{$name} = $value;
        }
        return true;
    }

    /**
     * Count number of requests added (Countable interface)
     *
     * @return int
     */
    #[\ReturnTypeWillChange]
    public function count() {
        return count($this->requests);
    }

    /**
     * Add a request to the request queue
     *
     * @param Request $request
     * @return bool
     */
    public function add($request) {
        $this->requests[] = $request;
        return true;
    }

    /**
     * Create new Request and add it to the request queue
     *
     * @param string $url
     * @param string $method
     * @param $post_data
     * @param $headers
     * @param $options
     * @return bool
     */
    public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
        $this->requests[] = new RollingCurlRequest($url, $method, $post_data, $headers, $options);
        return true;
    }

    /**
     * Queue a GET request
     *
     * @param string $url
     * @param $headers
     * @param $options
     * @return bool
     */
    public function get($url, $headers = null, $options = null) {
        return $this->request($url, "GET", null, $headers, $options);
    }

    /**
     * Queue a POST request
     *
     * @param string $url
     * @param $post_data
     * @param $headers
     * @param $options
     * @return bool
     */
    public function post($url, $post_data = null, $headers = null, $options = null) {
        return $this->request($url, "POST", $post_data, $headers, $options);
    }

    /**
     * Execute processing
     *
     * @param int $window_size Max number of simultaneous connections
     * @return string|bool
     */
    public function execute($window_size = null) {
        // rolling curl window must always be greater than 1
        if (count($this->requests) == 1) {
            return $this->single_curl();
        }
        // start the rolling curl. window_size is the max number of simultaneous connections
        return $this->rolling_curl($window_size);
    }

    /**
     * Performs a single curl request
     *
     * @access private
     * @return string|bool raw output when no callback is set, true otherwise
     */
    private function single_curl() {
        $ch = curl_init();
        $request = array_shift($this->requests);
        curl_setopt_array($ch, $this->get_options($request));
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);
        $this->close_handle($ch);

        // it's not necessary to set a callback for one-off requests
        if (!$this->callback) {
            return $output;
        }
        if (is_callable($this->callback)) {
            call_user_func($this->callback, $output, $info, $request);
        }
        return true;
    }

    /**
     * Performs multiple curl requests
     *
     * @access private
     * @throws RollingCurlException
     * @param int $window_size Max number of simultaneous connections
     * @return bool
     */
    private function rolling_curl($window_size = null) {
        if ($window_size)
            $this->window_size = $window_size;

        // make sure the rolling window isn't greater than the # of urls
        $total = count($this->requests);
        if ($total < $this->window_size)
            $this->window_size = $total;

        if ($this->window_size < 2) {
            throw new RollingCurlException("Window size must be greater than 1");
        }

        $master = curl_multi_init();

        // start the first batch of requests
        for ($i = 0; $i < $this->window_size; $i++) {
            $this->start_request($master, $i);
        }

        do {
            while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ;
            if ($execrun != CURLM_OK)
                break;
            // a request was just completed -- find out which one
            while ($done = curl_multi_info_read($master)) {
                $handle = $done['handle'];

                // get the info and content returned on the request
                $info = curl_getinfo($handle);
                $output = curl_multi_getcontent($handle);

                // look up (and forget) the request this handle belongs to
                $key = $this->handle_key($handle);
                $request = isset($this->requestMap[$key]) ? $this->requests[$this->requestMap[$key]] : null;
                unset($this->requestMap[$key]);

                // send the return values to the callback function.
                $callback = $this->callback;
                if (is_callable($callback)) {
                    call_user_func($callback, $output, $info, $request);
                }

                // start a new request (it's important to do this before removing the old one)
                if (isset($this->requests[$i])) {
                    $this->start_request($master, $i);
                    $i++;
                }

                // remove and release the curl handle that just completed
                curl_multi_remove_handle($master, $handle);
                $this->close_handle($handle);
            }

            // Block for data in / output; error handling is done by curl_multi_exec.
            // No explicit timeout is passed as it causes problems on Windows
            // with PHP 5.3.5 and Curl 7.20.0.
            if ($running && curl_multi_select($master) === -1) {
                // select failed immediately: back off briefly instead of spinning
                usleep(100);
            }
        } while ($running);
        curl_multi_close($master);
        return true;
    }

    /**
     * Create a curl handle for the queued request at $index, add it to the
     * multi handle and remember which request it belongs to.
     *
     * @access private
     * @param resource|object $master curl multi handle
     * @param int $index position of the request in the queue
     * @return void
     */
    private function start_request($master, $index) {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        curl_multi_add_handle($master, $ch);
        $this->requestMap[$this->handle_key($ch)] = $index;
    }

    /**
     * Build a string key identifying a curl handle.
     *
     * Handles are resources on older PHP versions (castable to string) but
     * objects on PHP 8, where a string cast throws; spl_object_id() is
     * used for those instead.
     *
     * @access private
     * @param resource|object $ch
     * @return string
     */
    private function handle_key($ch) {
        return is_object($ch) ? 'curl#' . spl_object_id($ch) : (string) $ch;
    }

    /**
     * Release a finished curl handle. Only resource handles need an explicit
     * close; object handles are freed when no longer referenced.
     *
     * @access private
     * @param resource|object $ch
     * @return void
     */
    private function close_handle($ch) {
        if (is_resource($ch)) {
            curl_close($ch);
        }
    }

    /**
     * Helper function to set up a new request by setting the appropriate options
     *
     * @access private
     * @param Request $request
     * @return array
     */
    private function get_options($request) {
        // options for this entire curl object
        $options = $this->__get('options');
        // We're managing redirects in PHP - allows us to intervene and rewrite/block URLs
        // before the next request goes out.
        $options[CURLOPT_FOLLOWLOCATION] = 0;
        $options[CURLOPT_MAXREDIRS] = 0;

        $headers = $this->__get('headers');
        // append custom headers for this specific request (lists are
        // concatenated so base and per-request headers are both sent)
        if ($request->headers) {
            $headers = array_merge((array) $headers, $request->headers);
        }

        // append custom options for this specific request
        if ($request->options) {
            $options = $request->options + $options;
        }

        // set the request URL
        $options[CURLOPT_URL] = $request->url;

        if ($headers) {
            $options[CURLOPT_HTTPHEADER] = $headers;
        }
        // return response headers
        $options[CURLOPT_HEADER] = 1;

        // send HEAD request?
        if ($request->method == 'HEAD') {
            $options[CURLOPT_NOBODY] = 1;
        }

        return $options;
    }

    /**
     * @return void
     */
    public function __destruct() {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}
|
79
inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
vendored
Normal file
79
inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
vendored
Normal file
|
@ -0,0 +1,79 @@
|
|||
<?php
|
||||
/**
|
||||
* Humble HTTP Agent extension for SimplePie_File
|
||||
*
|
||||
* This class is designed to extend and override SimplePie_File
|
||||
* in order to prevent duplicate HTTP requests being sent out.
|
||||
* The idea is to initialise an instance of Humble HTTP Agent
|
||||
* and attach it, to a static class variable, of this class.
|
||||
* SimplePie will then automatically initialise this class
|
||||
*
|
||||
* @date 2011-02-28
|
||||
*/
|
||||
|
||||
class SimplePie_HumbleHttpAgent extends SimplePie_File
{
    /** @var HumbleHttpAgent|null agent shared by all instances; see set_agent() */
    protected static $agent;
    public $url;
    public $useragent;
    public $success = true;
    public $headers = array();
    public $body;
    public $status_code;
    public $redirects = 0;
    public $error;
    public $method = SIMPLEPIE_FILE_SOURCE_NONE;

    /**
     * Attach the Humble HTTP Agent instance that will perform the requests.
     *
     * @param HumbleHttpAgent $agent
     * @return void
     */
    public static function set_agent(HumbleHttpAgent $agent) {
        self::$agent = $agent;
    }

    /**
     * Retrieve $url through the attached agent and populate headers, body
     * and status_code from its response. On failure $success is set to
     * false and $error describes the reason.
     *
     * $timeout, $redirects and $force_fsockopen are accepted but not used
     * here; $headers are not yet sent with the request (see TODO below).
     *
     * @param string $url
     * @param int $timeout
     * @param int $redirects
     * @param array|null $headers
     * @param string|null $useragent
     * @param bool $force_fsockopen
     */
    public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
        if (class_exists('idna_convert'))
        {
            // convert an internationalised host name to its ASCII form
            $idn = new idna_convert();
            $parsed = SimplePie_Misc::parse_url($url);
            $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
        }
        $this->url = $url;
        $this->useragent = $useragent;

        if (!preg_match('/^http(s)?:\/\//i', $url))
        {
            $this->error = 'invalid URL';
            $this->success = false;
            return;
        }

        if (!is_array($headers))
        {
            $headers = array();
        }
        $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
        $headers2 = array();
        foreach ($headers as $key => $value) {
            $headers2[] = "$key: $value";
        }
        //TODO: allow for HTTP headers
        // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);

        // Without an agent there is nothing to fetch with; report a failure
        // instead of calling a method on null.
        if (self::$agent === null) {
            $this->error = 'no HTTP agent set';
            $this->success = false;
            return;
        }

        $response = self::$agent->get($url);

        if ($response === false || !isset($response['status_code'])) {
            $this->error = 'failed to fetch URL';
            $this->success = false;
            return;
        }

        // The extra lines at the end are there to satisfy SimplePie's HTTP parser.
        // The class expects a full HTTP message, whereas we're giving it only
        // headers - the new lines indicate the start of the body.
        $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
        if ($parser->parse()) {
            $this->headers = $parser->headers;
            //$this->body = $parser->body;
            $this->body = $response['body'];
            $this->status_code = $parser->status_code;
        }
    }
}
|
||||
?>
|
Loading…
Add table
Add a link
Reference in a new issue