@fivefilters via composer

This commit is contained in:
Nicolas Lœuillet 2015-01-19 21:27:22 +01:00
parent 9e7f6caf03
commit c78c1a3f08
1033 changed files with 42 additions and 26775 deletions

View file

@ -46,6 +46,10 @@
{
"type": "vcs",
"url": "https://github.com/wallabag/phpMobi"
},
{
"type": "vcs",
"url": "https://github.com/wallabag/Fivefilters_Libraries"
}
],
"require": {
@ -64,7 +68,8 @@
"wallabag/pagination": "dev-master",
"wallabag/PHPePub": "dev-master",
"wallabag/php-readability": "dev-master",
"wallabag/phpMobi": "dev-master"
"wallabag/phpMobi": "dev-master",
"wallabag/Fivefilters_Libraries": "dev-master"
},
"require-dev": {
"phpunit/phpunit": "~3.7"

35
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at http://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "4caad70325b5cf76eb8d4011bae06cbb",
"hash": "93ca2016541bb1e9aaf52a82468ce04d",
"packages": [
{
"name": "ezyang/htmlpurifier",
@ -1751,6 +1751,36 @@
"description": "The Twig Gettext Extractor is Poedit friendly tool which extracts translations from twig templates.",
"time": "2013-02-14 16:41:48"
},
{
"name": "wallabag/Fivefilters_Libraries",
"version": "dev-master",
"source": {
"type": "git",
"url": "https://github.com/wallabag/Fivefilters_Libraries.git",
"reference": "93b5469d6036a68ce60f126f8430b3e1fbf1562f"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/wallabag/Fivefilters_Libraries/zipball/93b5469d6036a68ce60f126f8430b3e1fbf1562f",
"reference": "93b5469d6036a68ce60f126f8430b3e1fbf1562f",
"shasum": ""
},
"type": "library",
"authors": [
{
"name": "Nicolas Lœuillet",
"email": "nicolas@loeuillet.org",
"homepage": "http://www.cdetc.fr"
}
],
"description": "Libraries from @fivefilters.",
"homepage": "https://github.com/wallabag/Fivefilters_Libraries",
"support": {
"source": "https://github.com/wallabag/Fivefilters_Libraries/tree/master",
"issues": "https://github.com/wallabag/Fivefilters_Libraries/issues"
},
"time": "2015-01-19 20:19:28"
},
{
"name": "wallabag/PHPePub",
"version": "dev-master",
@ -2390,7 +2420,8 @@
"wallabag/pagination": 20,
"wallabag/phpepub": 20,
"wallabag/php-readability": 20,
"wallabag/phpmobi": 20
"wallabag/phpmobi": 20,
"wallabag/fivefilters_libraries": 20
},
"prefer-stable": false,
"prefer-lowest": false,

View file

@ -1,461 +0,0 @@
<?php
/* Full-Text RSS config */
// ......IMPORTANT......................................
// .....................................................
// Please do not change this file (config.php) directly.
// Save a copy as custom_config.php and make your
// changes to that instead. It will automatically
// override anything in config.php. Because config.php
// always gets loaded anyway, you can simply specify
// options you'd like to override in custom_config.php.
// .....................................................
global $options;
// Create config object
if (!isset($options)) $options = new stdClass();
// Enable service
// ----------------------
// Set this to false if you want to disable the service.
// If set to false, no feed is produced and users will
// be told that the service is disabled.
$options->enabled = true;
// Debug mode
// ----------------------
// Enable or disable debugging. When enabled debugging works by passing
// &debug to the makefulltextfeed.php querystring.
// Valid values:
// true or 'user' (default) - let user decide
// 'admin' - debug works only for logged in admin users
// false - disabled
$options->debug = true;
// Default entries (without access key)
// ----------------------
// The number of feed items to process when no API key is supplied
// and no &max=x value is supplied in the querystring.
$options->default_entries = 5;
// Max entries (without access key)
// ----------------------
// The maximum number of feed items to process when no access key is supplied.
// This limits the user-supplied &max=x value. For example, if the user
// asks for 20 items to be processed (&max=20), if max_entries is set to
// 10, only 10 will be processed.
$options->max_entries = 10;
// Full content
// ----------------------
// By default Full-Text RSS includes the extracted content in the output.
// You can exclude this from the output by passing '&content=0' in the querystring.
//
// Possible values...
// Always include: true
// Never include: false
// Include unless user overrides (&content=0): 'user' (default)
//
// Note: currently this does not disable full content extraction. It simply omits it
// from the output.
$options->content = 'user';
// Excerpts
// ----------------------
// By default Full-Text RSS does not include excerpts in the output.
// You can enable this by passing '&summary=1' in the querystring.
// This will include a plain text excerpt from the extracted content.
//
// Possible values...
// Always include: true (recommended for new users)
// Never include: false
// Don't include unless user overrides (&summary=1): 'user' (default)
//
// Important: if both content and excerpts are requested, the excerpt will be
// placed in the description element and the full content inside content:encoded.
// If excerpts are not requested, the full content will go inside the description element.
//
// Why are we not returning both excerpts and content by default?
// Mainly for backward compatibility.
// Excerpts should appear in the feed item's description element. Previous versions
// of Full-Text RSS did not return excerpts, so the description element was always
// used for the full content (as recommended by the RSS advisory). When returning both,
// we need somewhere else to place the content (content:encoded).
// Having both enabled should not create any problems for news readers, but it may create
// problems for developers upgrading from one of our earlier versions who may now find
// their applications are returning excerpts instead of the full content they were
// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,
// excerpts must be explicitly requested in the querystring by default.
//
// Why not use a different element name for excerpts?
// According to the RSS advisory:
// "Publishers who employ summaries should store the summary in description and
// the full content in content:encoded, ordering description first within the item.
// On items with no summary, the full content should be stored in description."
// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded
//
// For more consistent element naming, we recommend new users set this option to true.
// The full content can still be excluded via the querystring, but the element names
// will not change: when $options->summary = true, the description element will always
// be reserved for the excerpt and content:encoded always for full content.
$options->summary = 'user';
// Rewrite relative URLs
// ----------------------
// With this enabled relative URLs found in the extracted content
// block are automatically rewritten as absolute URLs.
$options->rewrite_relative_urls = true;
// Exclude items if extraction fails
// ---------------------------------
// Excludes items from the resulting feed
// if we cannot extract any content from the
// item URL.
// Possible values...
// Enable: true
// Disable: false (default)
// User decides: 'user' (this option will appear on the form)
$options->exclude_items_on_fail = 'user';
// Enable multi-page support
// -------------------------
// If enabled, we will try to follow next page links on multi-page articles.
// Currently this only happens for sites where next_page_link has been defined
// in a site config file.
$options->multipage = true;
// Enable caching
// ----------------------
// Enable this if you'd like to cache results
// for 10 minutes. Cache files are written to disk (in cache/ subfolders
// - which must be writable).
// Initially it's best to keep this disabled to make sure everything works
// as expected. If you have APC enabled, please also see smart_cache in the
// advanced section.
$options->caching = false;
// Cache directory
// ----------------------
// Only used if caching is true
$options->cache_dir = dirname(__FILE__).'/cache';
// Message to prepend (without access key)
// ----------------------
// HTML to insert at the beginning of each feed item when no access key is supplied.
// Substitution tags:
// {url} - Feed item URL
// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_prepend = '';
// Message to append (without access key)
// ----------------------
// HTML to insert at the end of each feed item when no access key is supplied.
// Substitution tags:
// {url} - Feed item URL
// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_append = '';
// Error message when content extraction fails (without access key)
// ----------------------
$options->error_message = '[unable to retrieve full-text content]';
// Keep enclosure in feed items
// If enabled, we will try to preserve enclosures if present.
// ----------------------
$options->keep_enclosures = true;
// Detect language
// ---------------
// Should we try and find/guess the language of the article being processed?
// Values will be placed inside the <dc:language> element inside each <item> element
// Possible values:
// * Ignore language: 0
// * Use article/feed metadata (e.g. HTML lang attribute): 1 (default)
// * As above, but guess if not present: 2
// * Always guess: 3
// * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. &l=2)
$options->detect_language = 1;
// Registration key
// ---------------
// The registration key is optional. It is not required to use Full-Text RSS,
// and does not affect the normal operation of Full-Text RSS. It is currently
// only used on admin pages which help you update site patterns with the
// latest version offered by FiveFilters.org. For these admin-related
// tasks to complete, we will require a valid registration key.
// If you would like one, you can purchase the latest version of Full-Text RSS
// at http://fivefilters.org/content-only/
// Your registration key will automatically be sent in the confirmation email.
// Once you have it, simply copy and paste it here.
$options->registration_key = '';
/////////////////////////////////////////////////
/// RESTRICT ACCESS /////////////////////////////
/////////////////////////////////////////////////
// Admin credentials
// ----------------------
// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials.
// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages.
// If no password or username is set, pages requiring admin privelages will be inaccessible.
// The default username is 'admin'.
// If overriding with an environment variable, separate username and password with a colon, e.g.:
// ftr_admin_credentials: admin:my-secret-password
// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');
$options->admin_credentials = array('username'=>'admin', 'password'=>'');
// URLs to allow
// ----------------------
// List of URLs (or parts of a URL) which the service will accept.
// If the list is empty, all URLs (except those specified in the blocked list below)
// will be permitted.
// Empty: array();
// Non-empty example: array('example.com', 'anothersite.org');
$options->allowed_urls = array();
// URLs to block
// ----------------------
// List of URLs (or parts of a URL) which the service will not accept.
// Note: this list is ignored if allowed_urls is not empty
$options->blocked_urls = array();
// Key holder(s) only?
// ----------------------
// Set this to true if you want to restrict access only to
// those with a key (see below to specify key(s)).
// If set to true, no feed is produced unless a valid
// key is provided.
$options->key_required = false;
// Favour item titles in feed
// ----------------------
// By default, when processing feeds, we assume item titles in the feed
// have not been truncated. So after processing web pages, the extracted titles
// are not used in the generated feed. If you prefer to have extracted titles in
// the feed you can either set this to false, in which case we will always favour
// extracted titles. Alternatively, if set to 'user' (default) we'll use the
// extracted title if you pass '&use_extracted_title' in the querystring.
// Possible values:
// * Favour feed titles: true
// * Favour extracted titles: false
// * Favour feed titles with user override: 'user' (default)
// Note: this has no effect when the input URL is to a web page - in these cases
// we always use the extracted title in the generated feed.
$options->favour_feed_titles = 'user';
// Access keys (password protected access)
// ------------------------------------
// NOTE: You do not need an API key from fivefilters.org to run your own
// copy of the code. This is here if you'd like to restrict access to
// _your_ copy.
// Keys let you group users - those with a key and those without - and
// restrict access to the service to those without a key.
// If you want everyone to access the service in the same way, you can
// leave the array below empty and ignore the access key options further down.
// The options further down let you control how the service should behave
// in each mode.
// Note: Explicitly including the index number (1 and 2 in the examples below)
// is highly recommended (when generating feeds, we encode the key and
// refer to it by index number and hash).
$options->api_keys = array();
// Example:
// $options->api_keys[1] = 'secret-key-1';
// $options->api_keys[2] = 'secret-key-2';
// Default entries (with access key)
// ----------------------
// The number of feed items to process when a valid access key is supplied.
$options->default_entries_with_key = 5;
// Max entries (with access key)
// ----------------------
// The maximum number of feed items to process when a valid access key is supplied.
$options->max_entries_with_key = 10;
/////////////////////////////////////////////////
/// ADVANCED OPTIONS ////////////////////////////
/////////////////////////////////////////////////
// Enable XSS filter?
// ----------------------
// We have not enabled this by default because we assume the majority of
// our users do not display the HTML retrieved by Full-Text RSS
// in a web page without further processing. If you subscribe to our generated
// feeds in your news reader application, it should, if it's good software, already
// filter the resulting HTML for XSS attacks, making it redundant for
// Full-Text RSS do the same. Similarly with frameworks/CMS which display
// feed content - the content should be treated like any other user-submitted content.
//
// If you are writing an application yourself which is processing feeds generated by
// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks
// or enable this option. This might be useful if you are processing our generated
// feeds with JavaScript on the client side - although there's client side xss
// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer
//
// If enabled, we'll pass retrieved HTML content through htmLawed with
// safe flag on and style attributes denied, see
// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6
// Note: if enabled this will also remove certain elements you may want to preserve, such as iframes.
//
// Valid values:
// true - enabled, all content will be filtered
// 'user' (default) - user must pass &xss in makefulltextfeed.php querystring to enable
// false - disabled
$options->xss_filter = 'user';
// Allowed parsers
// ----------------------
// Full-Text RSS attempts to use PHP's libxml extension to process HTML.
// While fast, on some sites it may not always produce good results.
// For these sites, you can specify an alternative HTML parser:
// parser: html5lib
// The html5lib parser is bundled with Full-Text RSS.
// see http://code.google.com/p/html5lib/
//
// To disable HTML parsing with html5lib, you can remove it from this list.
// By default we allow both: libxml and html5lib.
$options->allowed_parsers = array('libxml', 'html5lib');
//$options->allowed_parsers = array('libxml'); //disable html5lib - forcing libxml in all cases
// Enable Cross-Origin Resource Sharing (CORS)
// ----------------------
// If enabled we'll send the following HTTP header
// Access-Control-Allow-Origin: *
// see http://en.wikipedia.org/wiki/Cross-origin_resource_sharing
$options->cors = false;
// Use APC user cache?
// ----------------------
// If enabled we will store site config files (when requested
// for the first time) in APC's user cache. Keys prefixed with 'sc.'
// This improves performance by reducing disk access.
// Note: this has no effect if APC is unavailable on your server.
$options->apc = true;
// Smart cache (experimental)
// ----------------------
// With this option enabled we will not cache to disk immediately.
// We will store the cache key in APC and if it's requested again
// we will cache results to disk. Keys prefixed with 'cache.'
// This improves performance by reducing disk access.
// Note: this has no effect if APC is disabled or unavailable on your server,
// or if you have caching disabled.
$options->smart_cache = true;
// Fingerprints
// ----------------------
// key is fingerprint (fragment to find in HTML)
// value is host name to use for site config lookup if fingerprint matches
$options->fingerprints = array(
// Posterous
'<meta name="generator" content="Posterous"' => array('hostname'=>'fingerprint.posterous.com', 'head'=>true),
// Blogger
'<meta content=\'blogger\' name=\'generator\'' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
'<meta name="generator" content="Blogger"' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
// WordPress (hosted)
// '<meta name="generator" content="WordPress.com"' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true),
// WordPress (self-hosted and hosted)
'<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
);
// User Agent strings - mapping domain names
// ----------------------
// e.g. $options->user_agents = array('example.org' => 'PHP/5.2');
$options->user_agents = array( 'lifehacker.com' => 'PHP/5.2',
'gawker.com' => 'PHP/5.2',
'deadspin.com' => 'PHP/5.2',
'kotaku.com' => 'PHP/5.2',
'jezebel.com' => 'PHP/5.2',
'io9.com' => 'PHP/5.2',
'jalopnik.com' => 'PHP/5.2',
'gizmodo.com' => 'PHP/5.2',
'.wikipedia.org' => 'Mozilla/5.2',
'.fok.nl' => 'Googlebot/2.1',
'getpocket.com' => 'PHP/5.2'
);
// URL Rewriting
// ----------------------
// Currently allows simple string replace of URLs.
// Useful for rewriting certain URLs to point to a single page
// or HTML view. Although using the single_page_link site config
// instruction is the preferred way to do this, sometimes, as
// with Google Docs URLs, it's not possible.
// Note: this might move to the site config file at some point.
$options->rewrite_url = array(
// Rewrite public Google Docs URLs to point to HTML view:
// if a URL contains docs.google.com, replace /Doc? with /View?
'docs.google.com' => array('/Doc?' => '/View?'),
'tnr.com' => array('tnr.com/article/' => 'tnr.com/print/article/'),
'.m.wikipedia.org' => array('.m.wikipedia.org' => '.wikipedia.org'),
'm.vanityfair.com' => array('m.vanityfair.com' => 'www.vanityfair.com')
);
// Content-Type exceptions
// -----------------------
// Here you can define different actions based
// on the Content-Type header returned by server.
// MIME type as key, action as value.
// Valid actions:
// * 'exclude' - exclude this item from the result
// * 'link' - create HTML link to the item
$options->content_type_exc = array(
'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
'image' => array('action'=>'link', 'name'=>'Image'),
'audio' => array('action'=>'link', 'name'=>'Audio'),
'video' => array('action'=>'link', 'name'=>'Video')
);
// Cache directory level
// ----------------------
// Spread cache files over different directories (only used if caching is enabled).
// Used to prevent large number of files in one directory.
// This corresponds to Zend_Cache's hashed_directory_level
// see http://framework.zend.com/manual/en/zend.cache.backends.html
// It's best not to change this if you're unsure.
$options->cache_directory_level = 0;
// Cache cleanup
// -------------
// 0 = script will not clean cache (rename cachecleanup.php and use it for scheduled (e.g. cron) cache cleanup)
// 1 = clean cache everytime the script runs (not recommended)
// 100 = clean cache roughly once every 100 script runs
// x = clean cache roughly once every x script runs
// ...you get the idea :)
$options->cache_cleanup = 100;
/////////////////////////////////////////////////
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {
require_once dirname(__FILE__).'/custom_config.php';
}
// check for environment variables - often used on cloud platforms
// environment variables should be prefixed with 'ftr_', e.g.
// ftr_max_entries: 1
// will set the max_entries value to 1.
foreach ($options as $_key=>&$_val) {
$_key = "ftr_$_key";
if (($_env = getenv($_key)) !== false) {
if (is_array($_val)) {
if ($_key === 'ftr_admin_credentials') {
$_val = array_combine(array('username', 'password'), array_map('trim', explode(':', $_env, 2)));
if ($_val === false) $_val = array('username'=>'admin', 'password'=>'');
}
} elseif ($_env === 'true' || $_env === 'false') {
$_val = ($_env === 'true');
} elseif (is_numeric($_env)) {
$_val = (int)$_env;
} else { // string
$_val = $_env;
}
}
}
unset($_key, $_val, $_env);
}

View file

@ -1,250 +0,0 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Cache
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Cache.php 24656 2012-02-26 06:02:53Z adamlundrigan $
*/
/**
* @package Zend_Cache
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Cache
{
/**
* Standard frontends
*
* @var array
*/
public static $standardFrontends = array('Core', 'Output', 'Class', 'File', 'Function', 'Page');
/**
* Standard backends
*
* @var array
*/
public static $standardBackends = array('File', 'Sqlite', 'Memcached', 'Libmemcached', 'Apc', 'ZendPlatform',
'Xcache', 'TwoLevels', 'WinCache', 'ZendServer_Disk', 'ZendServer_ShMem');
/**
* Standard backends which implement the ExtendedInterface
*
* @var array
*/
public static $standardExtendedBackends = array('File', 'Apc', 'TwoLevels', 'Memcached', 'Libmemcached', 'Sqlite', 'WinCache');
/**
* Only for backward compatibility (may be removed in next major release)
*
* @var array
* @deprecated
*/
public static $availableFrontends = array('Core', 'Output', 'Class', 'File', 'Function', 'Page');
/**
* Only for backward compatibility (may be removed in next major release)
*
* @var array
* @deprecated
*/
public static $availableBackends = array('File', 'Sqlite', 'Memcached', 'Libmemcached', 'Apc', 'ZendPlatform', 'Xcache', 'WinCache', 'TwoLevels');
/**
* Consts for clean() method
*/
const CLEANING_MODE_ALL = 'all';
const CLEANING_MODE_OLD = 'old';
const CLEANING_MODE_MATCHING_TAG = 'matchingTag';
const CLEANING_MODE_NOT_MATCHING_TAG = 'notMatchingTag';
const CLEANING_MODE_MATCHING_ANY_TAG = 'matchingAnyTag';
/**
* Factory
*
* @param mixed $frontend frontend name (string) or Zend_Cache_Frontend_ object
* @param mixed $backend backend name (string) or Zend_Cache_Backend_ object
* @param array $frontendOptions associative array of options for the corresponding frontend constructor
* @param array $backendOptions associative array of options for the corresponding backend constructor
* @param boolean $customFrontendNaming if true, the frontend argument is used as a complete class name ; if false, the frontend argument is used as the end of "Zend_Cache_Frontend_[...]" class name
* @param boolean $customBackendNaming if true, the backend argument is used as a complete class name ; if false, the backend argument is used as the end of "Zend_Cache_Backend_[...]" class name
* @param boolean $autoload if true, there will no require_once for backend and frontend (useful only for custom backends/frontends)
* @throws Zend_Cache_Exception
* @return Zend_Cache_Core|Zend_Cache_Frontend
*/
public static function factory($frontend, $backend, $frontendOptions = array(), $backendOptions = array(), $customFrontendNaming = false, $customBackendNaming = false, $autoload = false)
{
if (is_string($backend)) {
$backendObject = self::_makeBackend($backend, $backendOptions, $customBackendNaming, $autoload);
} else {
if ((is_object($backend)) && (in_array('Zend_Cache_Backend_Interface', class_implements($backend)))) {
$backendObject = $backend;
} else {
self::throwException('backend must be a backend name (string) or an object which implements Zend_Cache_Backend_Interface');
}
}
if (is_string($frontend)) {
$frontendObject = self::_makeFrontend($frontend, $frontendOptions, $customFrontendNaming, $autoload);
} else {
if (is_object($frontend)) {
$frontendObject = $frontend;
} else {
self::throwException('frontend must be a frontend name (string) or an object');
}
}
$frontendObject->setBackend($backendObject);
return $frontendObject;
}
/**
* Backend Constructor
*
* @param string $backend
* @param array $backendOptions
* @param boolean $customBackendNaming
* @param boolean $autoload
* @return Zend_Cache_Backend
*/
public static function _makeBackend($backend, $backendOptions, $customBackendNaming = false, $autoload = false)
{
if (!$customBackendNaming) {
$backend = self::_normalizeName($backend);
}
if (in_array($backend, Zend_Cache::$standardBackends)) {
// we use a standard backend
$backendClass = 'Zend_Cache_Backend_' . $backend;
// security controls are explicit
require_once realpath(dirname(__FILE__).'/..').DIRECTORY_SEPARATOR.str_replace('_', DIRECTORY_SEPARATOR, $backendClass) . '.php';
} else {
// we use a custom backend
if (!preg_match('~^[\w\\\\]+$~D', $backend)) {
Zend_Cache::throwException("Invalid backend name [$backend]");
}
if (!$customBackendNaming) {
// we use this boolean to avoid an API break
$backendClass = 'Zend_Cache_Backend_' . $backend;
} else {
$backendClass = $backend;
}
if (!$autoload) {
$file = str_replace('_', DIRECTORY_SEPARATOR, $backendClass) . '.php';
if (!(self::_isReadable($file))) {
self::throwException("file $file not found in include_path");
}
require_once $file;
}
}
return new $backendClass($backendOptions);
}
/**
* Frontend Constructor
*
* @param string $frontend
* @param array $frontendOptions
* @param boolean $customFrontendNaming
* @param boolean $autoload
* @return Zend_Cache_Core|Zend_Cache_Frontend
*/
public static function _makeFrontend($frontend, $frontendOptions = array(), $customFrontendNaming = false, $autoload = false)
{
if (!$customFrontendNaming) {
$frontend = self::_normalizeName($frontend);
}
if (in_array($frontend, self::$standardFrontends)) {
// we use a standard frontend
// For perfs reasons, with frontend == 'Core', we can interact with the Core itself
$frontendClass = 'Zend_Cache_' . ($frontend != 'Core' ? 'Frontend_' : '') . $frontend;
// security controls are explicit
require_once realpath(dirname(__FILE__).'/..').DIRECTORY_SEPARATOR.str_replace('_', DIRECTORY_SEPARATOR, $frontendClass) . '.php';
} else {
// we use a custom frontend
if (!preg_match('~^[\w\\\\]+$~D', $frontend)) {
Zend_Cache::throwException("Invalid frontend name [$frontend]");
}
if (!$customFrontendNaming) {
// we use this boolean to avoid an API break
$frontendClass = 'Zend_Cache_Frontend_' . $frontend;
} else {
$frontendClass = $frontend;
}
if (!$autoload) {
$file = str_replace('_', DIRECTORY_SEPARATOR, $frontendClass) . '.php';
if (!(self::_isReadable($file))) {
self::throwException("file $file not found in include_path");
}
require_once $file;
}
}
return new $frontendClass($frontendOptions);
}
/**
* Throw an exception
*
* Note : for perf reasons, the "load" of Zend/Cache/Exception is dynamic
* @param string $msg Message for the exception
* @throws Zend_Cache_Exception
*/
public static function throwException($msg, Exception $e = null)
{
// For perfs reasons, we use this dynamic inclusion
require_once 'Zend/Cache/Exception.php';
throw new Zend_Cache_Exception($msg, 0, $e);
}
/**
* Normalize frontend and backend names to allow multiple words TitleCased
*
* @param string $name Name to normalize
* @return string
*/
protected static function _normalizeName($name)
{
$name = ucfirst(strtolower($name));
$name = str_replace(array('-', '_', '.'), ' ', $name);
$name = ucwords($name);
$name = str_replace(' ', '', $name);
if (stripos($name, 'ZendServer') === 0) {
$name = 'ZendServer_' . substr($name, strlen('ZendServer'));
}
return $name;
}
/**
* Returns TRUE if the $filename is readable, or FALSE otherwise.
* This function uses the PHP include_path, where PHP's is_readable()
* does not.
*
* Note : this method comes from Zend_Loader (see #ZF-2891 for details)
*
* @param string $filename
* @return boolean
*/
private static function _isReadable($filename)
{
if (!$fh = @fopen($filename, 'r', true)) {
return false;
}
@fclose($fh);
return true;
}
}

View file

@ -1,290 +0,0 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Cache
* @subpackage Zend_Cache_Backend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Backend.php 24989 2012-06-21 07:24:13Z mabe $
*/
/**
* @package Zend_Cache
* @subpackage Zend_Cache_Backend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Cache_Backend
{
/**
* Frontend or Core directives
*
* =====> (int) lifetime :
* - Cache lifetime (in seconds)
* - If null, the cache is valid forever
*
* =====> (int) logging :
* - if set to true, a logging is activated throw Zend_Log
*
* @var array directives
*/
protected $_directives = array(
'lifetime' => 3600,
'logging' => false,
'logger' => null
);
/**
* Available options
*
* @var array available options
*/
protected $_options = array();
/**
* Constructor
*
* @param array $options Associative array of options
* @throws Zend_Cache_Exception
* @return void
*/
public function __construct(array $options = array())
{
while (list($name, $value) = each($options)) {
$this->setOption($name, $value);
}
}
/**
* Set the frontend directives
*
* @param array $directives Assoc of directives
* @throws Zend_Cache_Exception
* @return void
*/
public function setDirectives($directives)
{
if (!is_array($directives)) Zend_Cache::throwException('Directives parameter must be an array');
while (list($name, $value) = each($directives)) {
if (!is_string($name)) {
Zend_Cache::throwException("Incorrect option name : $name");
}
$name = strtolower($name);
if (array_key_exists($name, $this->_directives)) {
$this->_directives[$name] = $value;
}
}
$this->_loggerSanity();
}
/**
* Set an option
*
* @param string $name
* @param mixed $value
* @throws Zend_Cache_Exception
* @return void
*/
public function setOption($name, $value)
{
if (!is_string($name)) {
Zend_Cache::throwException("Incorrect option name : $name");
}
$name = strtolower($name);
if (array_key_exists($name, $this->_options)) {
$this->_options[$name] = $value;
}
}
/**
* Returns an option
*
* @param string $name Optional, the options name to return
* @throws Zend_Cache_Exceptions
* @return mixed
*/
public function getOption($name)
{
$name = strtolower($name);
if (array_key_exists($name, $this->_options)) {
return $this->_options[$name];
}
if (array_key_exists($name, $this->_directives)) {
return $this->_directives[$name];
}
Zend_Cache::throwException("Incorrect option name : {$name}");
}
/**
* Get the life time
*
* if $specificLifetime is not false, the given specific life time is used
* else, the global lifetime is used
*
* @param int $specificLifetime
* @return int Cache life time
*/
public function getLifetime($specificLifetime)
{
if ($specificLifetime === false) {
return $this->_directives['lifetime'];
}
return $specificLifetime;
}
/**
* Return true if the automatic cleaning is available for the backend
*
* DEPRECATED : use getCapabilities() instead
*
* @deprecated
* @return boolean
*/
public function isAutomaticCleaningAvailable()
{
return true;
}
/**
* Determine system TMP directory and detect if we have read access
*
* inspired from Zend_File_Transfer_Adapter_Abstract
*
* @return string
* @throws Zend_Cache_Exception if unable to determine directory
*/
public function getTmpDir()
{
$tmpdir = array();
foreach (array($_ENV, $_SERVER) as $tab) {
foreach (array('TMPDIR', 'TEMP', 'TMP', 'windir', 'SystemRoot') as $key) {
if (isset($tab[$key]) && is_string($tab[$key])) {
if (($key == 'windir') or ($key == 'SystemRoot')) {
$dir = realpath($tab[$key] . '\\temp');
} else {
$dir = realpath($tab[$key]);
}
if ($this->_isGoodTmpDir($dir)) {
return $dir;
}
}
}
}
$upload = ini_get('upload_tmp_dir');
if ($upload) {
$dir = realpath($upload);
if ($this->_isGoodTmpDir($dir)) {
return $dir;
}
}
if (function_exists('sys_get_temp_dir')) {
$dir = sys_get_temp_dir();
if ($this->_isGoodTmpDir($dir)) {
return $dir;
}
}
// Attemp to detect by creating a temporary file
$tempFile = tempnam(md5(uniqid(rand(), TRUE)), '');
if ($tempFile) {
$dir = realpath(dirname($tempFile));
unlink($tempFile);
if ($this->_isGoodTmpDir($dir)) {
return $dir;
}
}
if ($this->_isGoodTmpDir('/tmp')) {
return '/tmp';
}
if ($this->_isGoodTmpDir('\\temp')) {
return '\\temp';
}
Zend_Cache::throwException('Could not determine temp directory, please specify a cache_dir manually');
}
/**
* Verify if the given temporary directory is readable and writable
*
* @param string $dir temporary directory
* @return boolean true if the directory is ok
*/
protected function _isGoodTmpDir($dir)
{
if (is_readable($dir)) {
if (is_writable($dir)) {
return true;
}
}
return false;
}
/**
* Make sure if we enable logging that the Zend_Log class
* is available.
* Create a default log object if none is set.
*
* @throws Zend_Cache_Exception
* @return void
*/
protected function _loggerSanity()
{
if (!isset($this->_directives['logging']) || !$this->_directives['logging']) {
return;
}
if (isset($this->_directives['logger'])) {
if ($this->_directives['logger'] instanceof Zend_Log) {
return;
}
Zend_Cache::throwException('Logger object is not an instance of Zend_Log class.');
}
// Create a default logger to the standard output stream
require_once 'Zend/Log.php';
require_once 'Zend/Log/Writer/Stream.php';
require_once 'Zend/Log/Filter/Priority.php';
$logger = new Zend_Log(new Zend_Log_Writer_Stream('php://output'));
$logger->addFilter(new Zend_Log_Filter_Priority(Zend_Log::WARN, '<='));
$this->_directives['logger'] = $logger;
}
/**
* Log a message at the WARN (4) priority.
*
* @param string $message
* @throws Zend_Cache_Exception
* @return void
*/
protected function _log($message, $priority = 4)
{
if (!$this->_directives['logging']) {
return;
}
if (!isset($this->_directives['logger'])) {
Zend_Cache::throwException('Logging is enabled but logger is not set.');
}
$logger = $this->_directives['logger'];
if (!$logger instanceof Zend_Log) {
Zend_Cache::throwException('Logger object is not an instance of Zend_Log class.');
}
$logger->log($message, $priority);
}
}

View file

@ -1,127 +0,0 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Cache
* @subpackage Zend_Cache_Backend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: ExtendedInterface.php 24593 2012-01-05 20:35:02Z matthew $
*/
/**
* @see Zend_Cache_Backend_Interface
*/
//require_once 'Zend/Cache/Backend/Interface.php';
require_once dirname(__FILE__).'/Interface.php';
/**
* @package Zend_Cache
* @subpackage Zend_Cache_Backend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
interface Zend_Cache_Backend_ExtendedInterface extends Zend_Cache_Backend_Interface
{
/**
* Return an array of stored cache ids
*
* @return array array of stored cache ids (string)
*/
public function getIds();
/**
* Return an array of stored tags
*
* @return array array of stored tags (string)
*/
public function getTags();
/**
* Return an array of stored cache ids which match given tags
*
* In case of multiple tags, a logical AND is made between tags
*
* @param array $tags array of tags
* @return array array of matching cache ids (string)
*/
public function getIdsMatchingTags($tags = array());
/**
* Return an array of stored cache ids which don't match given tags
*
* In case of multiple tags, a logical OR is made between tags
*
* @param array $tags array of tags
* @return array array of not matching cache ids (string)
*/
public function getIdsNotMatchingTags($tags = array());
/**
* Return an array of stored cache ids which match any given tags
*
* In case of multiple tags, a logical AND is made between tags
*
* @param array $tags array of tags
* @return array array of any matching cache ids (string)
*/
public function getIdsMatchingAnyTags($tags = array());
/**
* Return the filling percentage of the backend storage
*
* @return int integer between 0 and 100
*/
public function getFillingPercentage();
/**
* Return an array of metadatas for the given cache id
*
* The array must include these keys :
* - expire : the expire timestamp
* - tags : a string array of tags
* - mtime : timestamp of last modification time
*
* @param string $id cache id
* @return array array of metadatas (false if the cache id is not found)
*/
public function getMetadatas($id);
/**
* Give (if possible) an extra lifetime to the given cache id
*
* @param string $id cache id
* @param int $extraLifetime
* @return boolean true if ok
*/
public function touch($id, $extraLifetime);
/**
* Return an associative array of capabilities (booleans) of the backend
*
* The array must include these keys :
* - automatic_cleaning (is automating cleaning necessary)
* - tags (are tags supported)
* - expired_read (is it possible to read expired cache records
* (for doNotTestCacheValidity option for example))
* - priority does the backend deal with priority when saving
* - infinite_lifetime (is infinite lifetime can work with this backend)
* - get_list (is it possible to get the list of cache ids and the complete list of tags)
*
* @return array associative of with capabilities
*/
public function getCapabilities();
}

File diff suppressed because it is too large Load diff

View file

@ -1,99 +0,0 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Cache
* @subpackage Zend_Cache_Backend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Interface.php 24593 2012-01-05 20:35:02Z matthew $
*/
/**
* @package Zend_Cache
* @subpackage Zend_Cache_Backend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
interface Zend_Cache_Backend_Interface
{
/**
* Set the frontend directives
*
* @param array $directives assoc of directives
*/
public function setDirectives($directives);
/**
* Test if a cache is available for the given id and (if yes) return it (false else)
*
* Note : return value is always "string" (unserialization is done by the core not by the backend)
*
* @param string $id Cache id
* @param boolean $doNotTestCacheValidity If set to true, the cache validity won't be tested
* @return string|false cached datas
*/
public function load($id, $doNotTestCacheValidity = false);
/**
* Test if a cache is available or not (for the given id)
*
* @param string $id cache id
* @return mixed|false (a cache is not available) or "last modified" timestamp (int) of the available cache record
*/
public function test($id);
/**
* Save some string datas into a cache record
*
* Note : $data is always "string" (serialization is done by the
* core not by the backend)
*
* @param string $data Datas to cache
* @param string $id Cache id
* @param array $tags Array of strings, the cache record will be tagged by each string entry
* @param int $specificLifetime If != false, set a specific lifetime for this cache record (null => infinite lifetime)
* @return boolean true if no problem
*/
public function save($data, $id, $tags = array(), $specificLifetime = false);
/**
* Remove a cache record
*
* @param string $id Cache id
* @return boolean True if no problem
*/
public function remove($id);
/**
* Clean some cache records
*
* Available modes are :
* Zend_Cache::CLEANING_MODE_ALL (default) => remove all cache entries ($tags is not used)
* Zend_Cache::CLEANING_MODE_OLD => remove too old cache entries ($tags is not used)
* Zend_Cache::CLEANING_MODE_MATCHING_TAG => remove cache entries matching all given tags
* ($tags can be an array of strings or a single string)
* Zend_Cache::CLEANING_MODE_NOT_MATCHING_TAG => remove cache entries not {matching one of the given tags}
* ($tags can be an array of strings or a single string)
* Zend_Cache::CLEANING_MODE_MATCHING_ANY_TAG => remove cache entries matching any given tags
* ($tags can be an array of strings or a single string)
*
* @param string $mode Clean mode
* @param array $tags Array of tags
* @return boolean true if no problem
*/
public function clean($mode = Zend_Cache::CLEANING_MODE_ALL, $tags = array());
}

View file

@ -1,765 +0,0 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Cache
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Core.php 24989 2012-06-21 07:24:13Z mabe $
*/
/**
* @package Zend_Cache
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Cache_Core
{
/**
* Messages
*/
const BACKEND_NOT_SUPPORTS_TAG = 'tags are not supported by the current backend';
const BACKEND_NOT_IMPLEMENTS_EXTENDED_IF = 'Current backend doesn\'t implement the Zend_Cache_Backend_ExtendedInterface, so this method is not available';
/**
* Backend Object
*
* @var Zend_Cache_Backend_Interface $_backend
*/
protected $_backend = null;
/**
* Available options
*
* ====> (boolean) write_control :
* - Enable / disable write control (the cache is read just after writing to detect corrupt entries)
* - Enable write control will lightly slow the cache writing but not the cache reading
* Write control can detect some corrupt cache files but maybe it's not a perfect control
*
* ====> (boolean) caching :
* - Enable / disable caching
* (can be very useful for the debug of cached scripts)
*
* =====> (string) cache_id_prefix :
* - prefix for cache ids (namespace)
*
* ====> (boolean) automatic_serialization :
* - Enable / disable automatic serialization
* - It can be used to save directly datas which aren't strings (but it's slower)
*
* ====> (int) automatic_cleaning_factor :
* - Disable / Tune the automatic cleaning process
* - The automatic cleaning process destroy too old (for the given life time)
* cache files when a new cache file is written :
* 0 => no automatic cache cleaning
* 1 => systematic cache cleaning
* x (integer) > 1 => automatic cleaning randomly 1 times on x cache write
*
* ====> (int) lifetime :
* - Cache lifetime (in seconds)
* - If null, the cache is valid forever.
*
* ====> (boolean) logging :
* - If set to true, logging is activated (but the system is slower)
*
* ====> (boolean) ignore_user_abort
* - If set to true, the core will set the ignore_user_abort PHP flag inside the
* save() method to avoid cache corruptions in some cases (default false)
*
* @var array $_options available options
*/
protected $_options = array(
'write_control' => true,
'caching' => true,
'cache_id_prefix' => null,
'automatic_serialization' => false,
'automatic_cleaning_factor' => 10,
'lifetime' => 3600,
'logging' => false,
'logger' => null,
'ignore_user_abort' => false
);
/**
* Array of options which have to be transfered to backend
*
* @var array $_directivesList
*/
protected static $_directivesList = array('lifetime', 'logging', 'logger');
/**
* Not used for the core, just a sort a hint to get a common setOption() method (for the core and for frontends)
*
* @var array $_specificOptions
*/
protected $_specificOptions = array();
/**
* Last used cache id
*
* @var string $_lastId
*/
private $_lastId = null;
/**
* True if the backend implements Zend_Cache_Backend_ExtendedInterface
*
* @var boolean $_extendedBackend
*/
protected $_extendedBackend = false;
/**
* Array of capabilities of the backend (only if it implements Zend_Cache_Backend_ExtendedInterface)
*
* @var array
*/
protected $_backendCapabilities = array();
/**
* Constructor
*
* @param array|Zend_Config $options Associative array of options or Zend_Config instance
* @throws Zend_Cache_Exception
* @return void
*/
public function __construct($options = array())
{
if ($options instanceof Zend_Config) {
$options = $options->toArray();
}
if (!is_array($options)) {
Zend_Cache::throwException("Options passed were not an array"
. " or Zend_Config instance.");
}
while (list($name, $value) = each($options)) {
$this->setOption($name, $value);
}
$this->_loggerSanity();
}
/**
* Set options using an instance of type Zend_Config
*
* @param Zend_Config $config
* @return Zend_Cache_Core
*/
public function setConfig(Zend_Config $config)
{
$options = $config->toArray();
while (list($name, $value) = each($options)) {
$this->setOption($name, $value);
}
return $this;
}
/**
* Set the backend
*
* @param Zend_Cache_Backend $backendObject
* @throws Zend_Cache_Exception
* @return void
*/
public function setBackend(Zend_Cache_Backend $backendObject)
{
$this->_backend= $backendObject;
// some options (listed in $_directivesList) have to be given
// to the backend too (even if they are not "backend specific")
$directives = array();
foreach (Zend_Cache_Core::$_directivesList as $directive) {
$directives[$directive] = $this->_options[$directive];
}
$this->_backend->setDirectives($directives);
if (in_array('Zend_Cache_Backend_ExtendedInterface', class_implements($this->_backend))) {
$this->_extendedBackend = true;
$this->_backendCapabilities = $this->_backend->getCapabilities();
}
}
/**
* Returns the backend
*
* @return Zend_Cache_Backend backend object
*/
public function getBackend()
{
return $this->_backend;
}
/**
* Public frontend to set an option
*
* There is an additional validation (relatively to the protected _setOption method)
*
* @param string $name Name of the option
* @param mixed $value Value of the option
* @throws Zend_Cache_Exception
* @return void
*/
public function setOption($name, $value)
{
if (!is_string($name)) {
Zend_Cache::throwException("Incorrect option name!");
}
$name = strtolower($name);
if (array_key_exists($name, $this->_options)) {
// This is a Core option
$this->_setOption($name, $value);
return;
}
if (array_key_exists($name, $this->_specificOptions)) {
// This a specic option of this frontend
$this->_specificOptions[$name] = $value;
return;
}
}
/**
* Public frontend to get an option value
*
* @param string $name Name of the option
* @throws Zend_Cache_Exception
* @return mixed option value
*/
public function getOption($name)
{
$name = strtolower($name);
if (array_key_exists($name, $this->_options)) {
// This is a Core option
return $this->_options[$name];
}
if (array_key_exists($name, $this->_specificOptions)) {
// This a specic option of this frontend
return $this->_specificOptions[$name];
}
Zend_Cache::throwException("Incorrect option name : $name");
}
/**
* Set an option
*
* @param string $name Name of the option
* @param mixed $value Value of the option
* @throws Zend_Cache_Exception
* @return void
*/
private function _setOption($name, $value)
{
if (!is_string($name) || !array_key_exists($name, $this->_options)) {
Zend_Cache::throwException("Incorrect option name : $name");
}
if ($name == 'lifetime' && empty($value)) {
$value = null;
}
$this->_options[$name] = $value;
}
/**
* Force a new lifetime
*
* The new value is set for the core/frontend but for the backend too (directive)
*
* @param int $newLifetime New lifetime (in seconds)
* @return void
*/
public function setLifetime($newLifetime)
{
$this->_options['lifetime'] = $newLifetime;
$this->_backend->setDirectives(array(
'lifetime' => $newLifetime
));
}
/**
* Test if a cache is available for the given id and (if yes) return it (false else)
*
* @param string $id Cache id
* @param boolean $doNotTestCacheValidity If set to true, the cache validity won't be tested
* @param boolean $doNotUnserialize Do not serialize (even if automatic_serialization is true) => for internal use
* @return mixed|false Cached datas
*/
public function load($id, $doNotTestCacheValidity = false, $doNotUnserialize = false)
{
if (!$this->_options['caching']) {
return false;
}
$id = $this->_id($id); // cache id may need prefix
$this->_lastId = $id;
self::_validateIdOrTag($id);
$this->_log("Zend_Cache_Core: load item '{$id}'", 7);
$data = $this->_backend->load($id, $doNotTestCacheValidity);
if ($data===false) {
// no cache available
return false;
}
if ((!$doNotUnserialize) && $this->_options['automatic_serialization']) {
// we need to unserialize before sending the result
return unserialize($data);
}
return $data;
}
/**
* Test if a cache is available for the given id
*
* @param string $id Cache id
* @return int|false Last modified time of cache entry if it is available, false otherwise
*/
public function test($id)
{
if (!$this->_options['caching']) {
return false;
}
$id = $this->_id($id); // cache id may need prefix
self::_validateIdOrTag($id);
$this->_lastId = $id;
$this->_log("Zend_Cache_Core: test item '{$id}'", 7);
return $this->_backend->test($id);
}
/**
* Save some data in a cache
*
* @param mixed $data Data to put in cache (can be another type than string if automatic_serialization is on)
* @param string $id Cache id (if not set, the last cache id will be used)
* @param array $tags Cache tags
* @param int $specificLifetime If != false, set a specific lifetime for this cache record (null => infinite lifetime)
* @param int $priority integer between 0 (very low priority) and 10 (maximum priority) used by some particular backends
* @throws Zend_Cache_Exception
* @return boolean True if no problem
*/
public function save($data, $id = null, $tags = array(), $specificLifetime = false, $priority = 8)
{
if (!$this->_options['caching']) {
return true;
}
if ($id === null) {
$id = $this->_lastId;
} else {
$id = $this->_id($id);
}
self::_validateIdOrTag($id);
self::_validateTagsArray($tags);
if ($this->_options['automatic_serialization']) {
// we need to serialize datas before storing them
$data = serialize($data);
} else {
if (!is_string($data)) {
Zend_Cache::throwException("Datas must be string or set automatic_serialization = true");
}
}
// automatic cleaning
if ($this->_options['automatic_cleaning_factor'] > 0) {
$rand = rand(1, $this->_options['automatic_cleaning_factor']);
if ($rand==1) {
// new way || deprecated way
if ($this->_extendedBackend || method_exists($this->_backend, 'isAutomaticCleaningAvailable')) {
$this->_log("Zend_Cache_Core::save(): automatic cleaning running", 7);
$this->clean(Zend_Cache::CLEANING_MODE_OLD);
} else {
$this->_log("Zend_Cache_Core::save(): automatic cleaning is not available/necessary with current backend", 4);
}
}
}
$this->_log("Zend_Cache_Core: save item '{$id}'", 7);
if ($this->_options['ignore_user_abort']) {
$abort = ignore_user_abort(true);
}
if (($this->_extendedBackend) && ($this->_backendCapabilities['priority'])) {
$result = $this->_backend->save($data, $id, $tags, $specificLifetime, $priority);
} else {
$result = $this->_backend->save($data, $id, $tags, $specificLifetime);
}
if ($this->_options['ignore_user_abort']) {
ignore_user_abort($abort);
}
if (!$result) {
// maybe the cache is corrupted, so we remove it !
$this->_log("Zend_Cache_Core::save(): failed to save item '{$id}' -> removing it", 4);
$this->_backend->remove($id);
return false;
}
if ($this->_options['write_control']) {
$data2 = $this->_backend->load($id, true);
if ($data!=$data2) {
$this->_log("Zend_Cache_Core::save(): write control of item '{$id}' failed -> removing it", 4);
$this->_backend->remove($id);
return false;
}
}
return true;
}
/**
* Remove a cache
*
* @param string $id Cache id to remove
* @return boolean True if ok
*/
public function remove($id)
{
if (!$this->_options['caching']) {
return true;
}
$id = $this->_id($id); // cache id may need prefix
self::_validateIdOrTag($id);
$this->_log("Zend_Cache_Core: remove item '{$id}'", 7);
return $this->_backend->remove($id);
}
/**
* Clean cache entries
*
* Available modes are :
* 'all' (default) => remove all cache entries ($tags is not used)
* 'old' => remove too old cache entries ($tags is not used)
* 'matchingTag' => remove cache entries matching all given tags
* ($tags can be an array of strings or a single string)
* 'notMatchingTag' => remove cache entries not matching one of the given tags
* ($tags can be an array of strings or a single string)
* 'matchingAnyTag' => remove cache entries matching any given tags
* ($tags can be an array of strings or a single string)
*
* @param string $mode
* @param array|string $tags
* @throws Zend_Cache_Exception
* @return boolean True if ok
*/
public function clean($mode = 'all', $tags = array())
{
if (!$this->_options['caching']) {
return true;
}
if (!in_array($mode, array(Zend_Cache::CLEANING_MODE_ALL,
Zend_Cache::CLEANING_MODE_OLD,
Zend_Cache::CLEANING_MODE_MATCHING_TAG,
Zend_Cache::CLEANING_MODE_NOT_MATCHING_TAG,
Zend_Cache::CLEANING_MODE_MATCHING_ANY_TAG))) {
Zend_Cache::throwException('Invalid cleaning mode');
}
self::_validateTagsArray($tags);
return $this->_backend->clean($mode, $tags);
}
/**
* Return an array of stored cache ids which match given tags
*
* In case of multiple tags, a logical AND is made between tags
*
* @param array $tags array of tags
* @return array array of matching cache ids (string)
*/
public function getIdsMatchingTags($tags = array())
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
if (!($this->_backendCapabilities['tags'])) {
Zend_Cache::throwException(self::BACKEND_NOT_SUPPORTS_TAG);
}
$ids = $this->_backend->getIdsMatchingTags($tags);
// we need to remove cache_id_prefix from ids (see #ZF-6178, #ZF-7600)
if (isset($this->_options['cache_id_prefix']) && $this->_options['cache_id_prefix'] !== '') {
$prefix = & $this->_options['cache_id_prefix'];
$prefixLen = strlen($prefix);
foreach ($ids as &$id) {
if (strpos($id, $prefix) === 0) {
$id = substr($id, $prefixLen);
}
}
}
return $ids;
}
/**
* Return an array of stored cache ids which don't match given tags
*
* In case of multiple tags, a logical OR is made between tags
*
* @param array $tags array of tags
* @return array array of not matching cache ids (string)
*/
public function getIdsNotMatchingTags($tags = array())
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
if (!($this->_backendCapabilities['tags'])) {
Zend_Cache::throwException(self::BACKEND_NOT_SUPPORTS_TAG);
}
$ids = $this->_backend->getIdsNotMatchingTags($tags);
// we need to remove cache_id_prefix from ids (see #ZF-6178, #ZF-7600)
if (isset($this->_options['cache_id_prefix']) && $this->_options['cache_id_prefix'] !== '') {
$prefix = & $this->_options['cache_id_prefix'];
$prefixLen = strlen($prefix);
foreach ($ids as &$id) {
if (strpos($id, $prefix) === 0) {
$id = substr($id, $prefixLen);
}
}
}
return $ids;
}
/**
* Return an array of stored cache ids which match any given tags
*
* In case of multiple tags, a logical OR is made between tags
*
* @param array $tags array of tags
* @return array array of matching any cache ids (string)
*/
public function getIdsMatchingAnyTags($tags = array())
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
if (!($this->_backendCapabilities['tags'])) {
Zend_Cache::throwException(self::BACKEND_NOT_SUPPORTS_TAG);
}
$ids = $this->_backend->getIdsMatchingAnyTags($tags);
// we need to remove cache_id_prefix from ids (see #ZF-6178, #ZF-7600)
if (isset($this->_options['cache_id_prefix']) && $this->_options['cache_id_prefix'] !== '') {
$prefix = & $this->_options['cache_id_prefix'];
$prefixLen = strlen($prefix);
foreach ($ids as &$id) {
if (strpos($id, $prefix) === 0) {
$id = substr($id, $prefixLen);
}
}
}
return $ids;
}
/**
* Return an array of stored cache ids
*
* @return array array of stored cache ids (string)
*/
public function getIds()
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
$ids = $this->_backend->getIds();
// we need to remove cache_id_prefix from ids (see #ZF-6178, #ZF-7600)
if (isset($this->_options['cache_id_prefix']) && $this->_options['cache_id_prefix'] !== '') {
$prefix = & $this->_options['cache_id_prefix'];
$prefixLen = strlen($prefix);
foreach ($ids as &$id) {
if (strpos($id, $prefix) === 0) {
$id = substr($id, $prefixLen);
}
}
}
return $ids;
}
/**
* Return an array of stored tags
*
* @return array array of stored tags (string)
*/
public function getTags()
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
if (!($this->_backendCapabilities['tags'])) {
Zend_Cache::throwException(self::BACKEND_NOT_SUPPORTS_TAG);
}
return $this->_backend->getTags();
}
/**
* Return the filling percentage of the backend storage
*
* @return int integer between 0 and 100
*/
public function getFillingPercentage()
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
return $this->_backend->getFillingPercentage();
}
/**
* Return an array of metadatas for the given cache id
*
* The array will include these keys :
* - expire : the expire timestamp
* - tags : a string array of tags
* - mtime : timestamp of last modification time
*
* @param string $id cache id
* @return array array of metadatas (false if the cache id is not found)
*/
public function getMetadatas($id)
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
$id = $this->_id($id); // cache id may need prefix
return $this->_backend->getMetadatas($id);
}
/**
* Give (if possible) an extra lifetime to the given cache id
*
* @param string $id cache id
* @param int $extraLifetime
* @return boolean true if ok
*/
public function touch($id, $extraLifetime)
{
if (!$this->_extendedBackend) {
Zend_Cache::throwException(self::BACKEND_NOT_IMPLEMENTS_EXTENDED_IF);
}
$id = $this->_id($id); // cache id may need prefix
$this->_log("Zend_Cache_Core: touch item '{$id}'", 7);
return $this->_backend->touch($id, $extraLifetime);
}
/**
* Validate a cache id or a tag (security, reliable filenames, reserved prefixes...)
*
* Throw an exception if a problem is found
*
* @param string $string Cache id or tag
* @throws Zend_Cache_Exception
* @return void
*/
protected static function _validateIdOrTag($string)
{
if (!is_string($string)) {
Zend_Cache::throwException('Invalid id or tag : must be a string');
}
if (substr($string, 0, 9) == 'internal-') {
Zend_Cache::throwException('"internal-*" ids or tags are reserved');
}
if (!preg_match('~^[a-zA-Z0-9_]+$~D', $string)) {
Zend_Cache::throwException("Invalid id or tag '$string' : must use only [a-zA-Z0-9_]");
}
}
/**
* Validate a tags array (security, reliable filenames, reserved prefixes...)
*
* Throw an exception if a problem is found
*
* @param array $tags Array of tags
* @throws Zend_Cache_Exception
* @return void
*/
protected static function _validateTagsArray($tags)
{
if (!is_array($tags)) {
Zend_Cache::throwException('Invalid tags array : must be an array');
}
foreach($tags as $tag) {
self::_validateIdOrTag($tag);
}
reset($tags);
}
/**
* Make sure if we enable logging that the Zend_Log class
* is available.
* Create a default log object if none is set.
*
* @throws Zend_Cache_Exception
* @return void
*/
protected function _loggerSanity()
{
if (!isset($this->_options['logging']) || !$this->_options['logging']) {
return;
}
if (isset($this->_options['logger']) && $this->_options['logger'] instanceof Zend_Log) {
return;
}
// Create a default logger to the standard output stream
require_once 'Zend/Log.php';
require_once 'Zend/Log/Writer/Stream.php';
require_once 'Zend/Log/Filter/Priority.php';
$logger = new Zend_Log(new Zend_Log_Writer_Stream('php://output'));
$logger->addFilter(new Zend_Log_Filter_Priority(Zend_Log::WARN, '<='));
$this->_options['logger'] = $logger;
}
/**
* Log a message at the WARN (4) priority.
*
* @param string $message
* @throws Zend_Cache_Exception
* @return void
*/
protected function _log($message, $priority = 4)
{
if (!$this->_options['logging']) {
return;
}
if (!(isset($this->_options['logger']) || $this->_options['logger'] instanceof Zend_Log)) {
Zend_Cache::throwException('Logging is enabled but logger is not set');
}
$logger = $this->_options['logger'];
$logger->log($message, $priority);
}
/**
* Make and return a cache id
*
* Checks 'cache_id_prefix' and returns new id with prefix or simply the id if null
*
* @param string $id Cache id
* @return string Cache id (with or without prefix)
*/
protected function _id($id)
{
if (($id !== null) && isset($this->_options['cache_id_prefix'])) {
return $this->_options['cache_id_prefix'] . $id; // return with prefix
}
return $id; // no prefix, just return the $id passed
}
}

View file

@ -1,32 +0,0 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Cache
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Exception.php 24593 2012-01-05 20:35:02Z matthew $
*/
/**
* @see Zend_Exception
*/
require_once 'Zend/Exception.php';
/**
* @package Zend_Cache
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Cache_Exception extends Zend_Exception {}

View file

@ -1,96 +0,0 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Exception.php 24593 2012-01-05 20:35:02Z matthew $
*/
/**
* @category Zend
* @package Zend
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Exception extends Exception
{
/**
* @var null|Exception
*/
private $_previous = null;
/**
* Construct the exception
*
* @param string $msg
* @param int $code
* @param Exception $previous
* @return void
*/
public function __construct($msg = '', $code = 0, Exception $previous = null)
{
if (version_compare(PHP_VERSION, '5.3.0', '<')) {
parent::__construct($msg, (int) $code);
$this->_previous = $previous;
} else {
parent::__construct($msg, (int) $code, $previous);
}
}
/**
* Overloading
*
* For PHP < 5.3.0, provides access to the getPrevious() method.
*
* @param string $method
* @param array $args
* @return mixed
*/
public function __call($method, array $args)
{
if ('getprevious' == strtolower($method)) {
return $this->_getPrevious();
}
return null;
}
/**
* String representation of the exception
*
* @return string
*/
public function __toString()
{
if (version_compare(PHP_VERSION, '5.3.0', '<')) {
if (null !== ($e = $this->getPrevious())) {
return $e->__toString()
. "\n\nNext "
. parent::__toString();
}
}
return parent::__toString();
}
/**
* Returns previous Exception
*
* @return Exception|null
*/
protected function _getPrevious()
{
return $this->_previous;
}
}

View file

@ -1,727 +0,0 @@
<?php
/**
* Content Extractor
*
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
* @version 1.0
* @date 2013-02-05
* @author Keyvan Minoukadeh
* @copyright 2013 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
class ContentExtractor
{
protected static $tidy_config = array(
'clean' => true,
'output-xhtml' => true,
'logical-emphasis' => true,
'show-body-only' => false,
'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
'new-inline-tags' => 'mark, time, meter, progress, data',
'wrap' => 0,
'drop-empty-paras' => true,
'drop-proprietary-attributes' => false,
'enclose-text' => true,
'enclose-block-text' => true,
'merge-divs' => true,
'merge-spans' => true,
'char-encoding' => 'utf8',
'hide-comments' => true
);
protected $html;
protected $config;
protected $title;
protected $author = array();
protected $language;
protected $date;
protected $body;
protected $success = false;
protected $nextPageUrl;
public $allowedParsers = array('libxml', 'html5lib');
public $fingerprints = array();
public $readability;
public $debug = false;
public $debugVerbose = false;
function __construct($path, $fallback=null) {
SiteConfig::set_config_path($path, $fallback);
}
protected function debug($msg) {
if ($this->debug) {
$mem = round(memory_get_usage()/1024, 2);
$memPeak = round(memory_get_peak_usage()/1024, 2);
echo '* ',$msg;
if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
echo "\n";
ob_flush();
flush();
}
}
public function reset() {
$this->html = null;
$this->readability = null;
$this->config = null;
$this->title = null;
$this->body = null;
$this->author = array();
$this->language = null;
$this->date = null;
$this->nextPageUrl = null;
$this->success = false;
}
public function findHostUsingFingerprints($html) {
$this->debug('Checking fingerprints...');
$head = substr($html, 0, 8000);
foreach ($this->fingerprints as $_fp => $_fphost) {
$lookin = 'html';
if (is_array($_fphost)) {
if (isset($_fphost['head']) && $_fphost['head']) {
$lookin = 'head';
}
$_fphost = $_fphost['hostname'];
}
if (strpos($$lookin, $_fp) !== false) {
$this->debug("Found match: $_fphost");
return $_fphost;
}
}
$this->debug('No fingerprint matches');
return false;
}
// returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
public function buildSiteConfig($url, $html='', $add_to_cache=true) {
// extract host name
$host = @parse_url($url, PHP_URL_HOST);
$host = strtolower($host);
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
// is merged version already cached?
if (SiteConfig::is_cached("$host.merged")) {
$this->debug("Returning cached and merged site config for $host");
return SiteConfig::build("$host.merged");
}
// let's build from site_config/custom/ and standard/
$config = SiteConfig::build($host);
if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
SiteConfig::add_to_cache($host, $config);
}
// if no match, use defaults
if (!$config) $config = new SiteConfig();
// load fingerprint config?
if ($config->autodetect_on_failure()) {
// check HTML for fingerprints
if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
if ($config_fingerprint = SiteConfig::build($_fphost)) {
$this->debug("Appending site config settings from $_fphost (fingerprint match)");
$config->append($config_fingerprint);
if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
//$config_fingerprint->cache_in_apc = true;
SiteConfig::add_to_cache($_fphost, $config_fingerprint);
}
}
}
}
// load global config?
if ($config->autodetect_on_failure()) {
if ($config_global = SiteConfig::build('global', true)) {
$this->debug('Appending site config settings from global.txt');
$config->append($config_global);
if ($add_to_cache && !SiteConfig::is_cached('global')) {
//$config_global->cache_in_apc = true;
SiteConfig::add_to_cache('global', $config_global);
}
}
}
// store copy of merged config
if ($add_to_cache) {
// do not store in APC if wildcard match
$use_apc = ($host == $config->cache_key);
$config->cache_key = null;
SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
}
return $config;
}
// returns true on success, false on failure
// $smart_tidy indicates that if tidy is used and no results are produced, we will
// try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
// but it has problems of its own which we try to avoid with this option.
public function process($html, $url, $smart_tidy=true) {
$this->reset();
$this->config = $this->buildSiteConfig($url, $html);
// do string replacements
if (!empty($this->config->find_string)) {
if (count($this->config->find_string) == count($this->config->replace_string)) {
$html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
$this->debug("Strings replaced: $_count (find_string and/or replace_string)");
} else {
$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
}
unset($_count);
}
// use tidy (if it exists)?
// This fixes problems with some sites which would otherwise
// trouble DOMDocument's HTML parsing. (Although sometimes it
// makes matters worse, which is why you can override it in site config files.)
$tidied = false;
if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
$this->debug('Using Tidy');
$tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
if (tidy_clean_repair($tidy)) {
$original_html = $html;
$tidied = true;
$html = $tidy->value;
}
unset($tidy);
}
// load and parse html
$_parser = $this->config->parser();
if (!in_array($_parser, $this->allowedParsers)) {
$this->debug("HTML parser $_parser not listed, using libxml instead");
$_parser = 'libxml';
}
$this->debug("Attempting to parse HTML with $_parser");
$this->readability = new Readability($html, $url, $_parser);
// we use xpath to find elements in the given HTML document
// see http://en.wikipedia.org/wiki/XPath_1.0
$xpath = new DOMXPath($this->readability->dom);
// try to get next page link
foreach ($this->config->next_page_link as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
$this->nextPageUrl = trim($elems);
break;
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
foreach ($elems as $item) {
if ($item instanceof DOMElement && $item->hasAttribute('href')) {
$this->nextPageUrl = $item->getAttribute('href');
break 2;
} elseif ($item instanceof DOMAttr && $item->value) {
$this->nextPageUrl = $item->value;
break 2;
}
}
}
}
// try to get title
foreach ($this->config->title as $pattern) {
// $this->debug("Trying $pattern");
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
$this->title = trim($elems);
$this->debug('Title expression evaluated as string: '.$this->title);
$this->debug("...XPath match: $pattern");
break;
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
$this->title = $elems->item(0)->textContent;
$this->debug('Title matched: '.$this->title);
$this->debug("...XPath match: $pattern");
// remove title from document
try {
@$elems->item(0)->parentNode->removeChild($elems->item(0));
} catch (DOMException $e) {
// do nothing
}
break;
}
}
// try to get author (if it hasn't already been set)
if (empty($this->author)) {
foreach ($this->config->author as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
if (trim($elems) != '') {
$this->author[] = trim($elems);
$this->debug('Author expression evaluated as string: '.trim($elems));
$this->debug("...XPath match: $pattern");
break;
}
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
foreach ($elems as $elem) {
if (!isset($elem->parentNode)) continue;
$this->author[] = trim($elem->textContent);
$this->debug('Author matched: '.trim($elem->textContent));
}
if (!empty($this->author)) {
$this->debug("...XPath match: $pattern");
break;
}
}
}
}
// try to get language
$_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
foreach ($_lang_xpath as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
if (trim($elems) != '') {
$this->language = trim($elems);
$this->debug('Language matched: '.$this->language);
break;
}
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
foreach ($elems as $elem) {
if (!isset($elem->parentNode)) continue;
$this->language = trim($elem->textContent);
$this->debug('Language matched: '.$this->language);
}
if ($this->language) break;
}
}
// try to get date
foreach ($this->config->date as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
$this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
$this->date = $elems->item(0)->textContent;
$this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));
// remove date from document
// $elems->item(0)->parentNode->removeChild($elems->item(0));
}
if (!$this->date) {
$this->date = null;
} else {
$this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));
$this->debug("...XPath match: $pattern");
break;
}
}
// strip elements (using xpath expressions)
foreach ($this->config->strip as $pattern) {
$elems = @$xpath->query($pattern, $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' elements (strip)');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
// strip elements (using id and class attribute values)
foreach ($this->config->strip_id_or_class as $string) {
$string = strtr($string, array("'"=>'', '"'=>''));
$elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
// strip images (using src attribute values)
foreach ($this->config->strip_image_src as $string) {
$string = strtr($string, array("'"=>'', '"'=>''));
$elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' image elements');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
// strip elements using Readability.com and Instapaper.com ignore class names
// .entry-unrelated and .instapaper_ignore
// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
// and http://blog.instapaper.com/post/730281947
$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
// strip elements that contain style="display: none;"
$elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' elements with inline display:none style');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
// try to get body
foreach ($this->config->body as $pattern) {
$elems = @$xpath->query($pattern, $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Body matched');
$this->debug("...XPath match: $pattern");
if ($elems->length == 1) {
$this->body = $elems->item(0);
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('...pruning content');
$this->readability->prepArticle($this->body);
}
break;
} else {
$this->body = $this->readability->dom->createElement('div');
$this->debug($elems->length.' body elems found');
foreach ($elems as $elem) {
if (!isset($elem->parentNode)) continue;
$isDescendant = false;
foreach ($this->body->childNodes as $parent) {
if ($this->isDescendant($parent, $elem)) {
$isDescendant = true;
break;
}
}
if ($isDescendant) {
$this->debug('...element is child of another body element, skipping.');
} else {
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('Pruning content');
$this->readability->prepArticle($elem);
}
$this->debug('...element added to body');
$this->body->appendChild($elem);
}
}
if ($this->body->hasChildNodes()) break;
}
}
}
// auto detect?
$detect_title = $detect_body = $detect_author = $detect_date = false;
// detect title?
if (!isset($this->title)) {
if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
$detect_title = true;
}
}
// detect body?
if (!isset($this->body)) {
if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
$detect_body = true;
}
}
// detect author?
if (empty($this->author)) {
if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
$detect_author = true;
}
}
// detect date?
if (!isset($this->date)) {
if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
$detect_date = true;
}
}
// check for hNews
if ($detect_title || $detect_body) {
// check for hentry
$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
if ($elems && $elems->length > 0) {
$this->debug('hNews: found hentry');
$hentry = $elems->item(0);
if ($detect_title) {
// check for entry-title
$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
if ($elems && $elems->length > 0) {
$this->title = $elems->item(0)->textContent;
$this->debug('hNews: found entry-title: '.$this->title);
// remove title from document
$elems->item(0)->parentNode->removeChild($elems->item(0));
$detect_title = false;
}
}
if ($detect_date) {
// check for time element with pubdate attribute
$elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
if ($elems && $elems->length > 0) {
$this->date = strtotime(trim($elems->item(0)->textContent));
// remove date from document
//$elems->item(0)->parentNode->removeChild($elems->item(0));
if ($this->date) {
$this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
$detect_date = false;
} else {
$this->date = null;
}
}
}
if ($detect_author) {
// check for time element with pubdate attribute
$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
if ($elems && $elems->length > 0) {
$author = $elems->item(0);
$fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
if ($fn && $fn->length > 0) {
foreach ($fn as $_fn) {
if (trim($_fn->textContent) != '') {
$this->author[] = trim($_fn->textContent);
$this->debug('hNews: found author: '.trim($_fn->textContent));
}
}
} else {
if (trim($author->textContent) != '') {
$this->author[] = trim($author->textContent);
$this->debug('hNews: found author: '.trim($author->textContent));
}
}
$detect_author = empty($this->author);
}
}
// check for entry-content.
// according to hAtom spec, if there are multiple elements marked entry-content,
// we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
if ($detect_body) {
$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
if ($elems && $elems->length > 0) {
$this->debug('hNews: found entry-content');
if ($elems->length == 1) {
// what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
$e = $elems->item(0);
if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
$this->body = $elems->item(0);
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('Pruning content');
$this->readability->prepArticle($this->body);
}
$detect_body = false;
} else {
$this->debug('hNews: skipping entry-content - appears not to contain content');
}
unset($e);
} else {
$this->body = $this->readability->dom->createElement('div');
$this->debug($elems->length.' entry-content elems found');
foreach ($elems as $elem) {
if (!isset($elem->parentNode)) continue;
$isDescendant = false;
foreach ($this->body->childNodes as $parent) {
if ($this->isDescendant($parent, $elem)) {
$isDescendant = true;
break;
}
}
if ($isDescendant) {
$this->debug('Element is child of another body element, skipping.');
} else {
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('Pruning content');
$this->readability->prepArticle($elem);
}
$this->debug('Element added to body');
$this->body->appendChild($elem);
}
}
$detect_body = false;
}
}
}
}
}
// check for elements marked with instapaper_title
if ($detect_title) {
// check for instapaper_title
$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
if ($elems && $elems->length > 0) {
$this->title = $elems->item(0)->textContent;
$this->debug('Title found (.instapaper_title): '.$this->title);
// remove title from document
$elems->item(0)->parentNode->removeChild($elems->item(0));
$detect_title = false;
}
}
// check for elements marked with instapaper_body
if ($detect_body) {
$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
if ($elems && $elems->length > 0) {
$this->debug('body found (.instapaper_body)');
$this->body = $elems->item(0);
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('Pruning content');
$this->readability->prepArticle($this->body);
}
$detect_body = false;
}
}
// Find author in rel="author" marked element
// We only use this if there's exactly one.
// If there's more than one, it could indicate more than
// one author, but it could also indicate that we're processing
// a page listing different articles with different authors.
if ($detect_author) {
$elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
if ($elems && $elems->length == 1) {
$author = trim($elems->item(0)->textContent);
if ($author != '') {
$this->debug("Author found (rel=\"author\"): $author");
$this->author[] = $author;
$detect_author = false;
}
}
}
// Find date in pubdate marked time element
// For the same reason given above, we only use this
// if there's exactly one element.
if ($detect_date) {
$elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
if ($elems && $elems->length == 1) {
$this->date = strtotime(trim($elems->item(0)->textContent));
// remove date from document
//$elems->item(0)->parentNode->removeChild($elems->item(0));
if ($this->date) {
$this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
$detect_date = false;
} else {
$this->date = null;
}
}
}
// still missing title or body, so we detect using Readability
if ($detect_title || $detect_body) {
$this->debug('Using Readability');
// clone body if we're only using Readability for title (otherwise it may interfere with body element)
if (isset($this->body)) $this->body = $this->body->cloneNode(true);
$success = $this->readability->init();
}
if ($detect_title) {
$this->debug('Detecting title');
$this->title = $this->readability->getTitle()->textContent;
}
if ($detect_body && $success) {
$this->debug('Detecting body');
$this->body = $this->readability->getContent();
if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
$this->body = $this->body->firstChild;
}
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('Pruning content');
$this->readability->prepArticle($this->body);
}
}
if (isset($this->body)) {
// remove scripts
$this->readability->removeScripts($this->body);
// remove any h1-h6 elements that appear as first thing in the body
// and which match our title
if (isset($this->title) && ($this->title != '')) {
$firstChild = $this->body->firstChild;
while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
$firstChild = $firstChild->nextSibling;
}
if (($firstChild->nodeType === XML_ELEMENT_NODE)
&& in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
&& (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {
$this->body->removeChild($firstChild);
}
}
// prevent self-closing iframes
$elems = $this->body->getElementsByTagName('iframe');
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
if (!$e->hasChildNodes()) {
$e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
}
}
// remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
// the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
// inside the data-lazy-src attribute. It also places the original image inside a noscript element
// next to the amended one.
$elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
// let's see if we can grab image from noscript
if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
$_new_elem = $e->ownerDocument->createDocumentFragment();
@$_new_elem->appendXML($e->nextSibling->innerHTML);
$e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
$e->parentNode->removeChild($e);
} else {
// Use data-lazy-src as src value
$e->setAttribute('src', $e->getAttribute('data-lazy-src'));
$e->removeAttribute('data-lazy-src');
}
}
$this->success = true;
}
// if we've had no success and we've used tidy, there's a chance
// that tidy has messed up. So let's try again without tidy...
if (!$this->success && $tidied && $smart_tidy) {
$this->debug('Trying again without tidy');
$this->process($original_html, $url, false);
}
return $this->success;
}
private function isDescendant(DOMElement $parent, DOMElement $child) {
$node = $child->parentNode;
while ($node != null) {
if ($node->isSameNode($parent)) return true;
$node = $node->parentNode;
}
return false;
}
public function getContent() {
return $this->body;
}
public function getTitle() {
return $this->title;
}
public function getAuthors() {
return $this->author;
}
public function getLanguage() {
return $this->language;
}
public function getDate() {
return $this->date;
}
public function getSiteConfig() {
return $this->config;
}
public function getNextPageUrl() {
return $this->nextPageUrl;
}
}

View file

@ -1,343 +0,0 @@
<?php
/**
* Site Config
*
* Each instance of this class should hold extraction patterns and other directives
* for a website. See ContentExtractor class to see how it's used.
*
* @version 0.8
* @date 2013-04-16
* @author Keyvan Minoukadeh
* @copyright 2013 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
class SiteConfig
{
// Use first matching element as title (0 or more xpath expressions)
public $title = array();
// Use first matching element as body (0 or more xpath expressions)
public $body = array();
// Use first matching element as author (0 or more xpath expressions)
public $author = array();
// Use first matching element as date (0 or more xpath expressions)
public $date = array();
// Strip elements matching these xpath expressions (0 or more)
public $strip = array();
// Strip elements which contain these strings (0 or more) in the id or class attribute
public $strip_id_or_class = array();
// Strip images which contain these strings (0 or more) in the src attribute
public $strip_image_src = array();
// Additional HTTP headers to send
// NOT YET USED
public $http_header = array();
// Process HTML with tidy before creating DOM (bool or null if undeclared)
public $tidy = null;
protected $default_tidy = true; // used if undeclared
// Autodetect title/body if xpath expressions fail to produce results.
// Note that this applies to title and body separately, ie.
// * if we get a body match but no title match, this option will determine whether we autodetect title
// * if neither match, this determines whether we autodetect title and body.
// Also note that this only applies when there is at least one xpath expression in title or body, ie.
// * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
// * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
// Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
// bool or null if undeclared
public $autodetect_on_failure = null;
protected $default_autodetect_on_failure = true; // used if undeclared
// Clean up content block - attempt to remove elements that appear to be superfluous
// bool or null if undeclared
public $prune = null;
protected $default_prune = true; // used if undeclared
// Test URL - if present, can be used to test the config above
public $test_url = array();
// Single-page link - should identify a link element or URL pointing to the page holding the entire article
// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
// display the first page with links to the other pages at the bottom. Often there is also a link to a page
// which displays the entire article on one page (e.g. 'print view').
// This should be an XPath expression identifying the link to that page. If present and we find a match,
// we will retrieve that page and the rest of the options in this config will be applied to the new page.
public $single_page_link = array();
public $next_page_link = array();
// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
public $single_page_link_in_feed = array();
// Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
// string or null if undeclared
public $parser = null;
protected $default_parser = 'libxml'; // used if undeclared
// Strings to search for in HTML before processing begins (used with $replace_string)
public $find_string = array();
// Strings to replace those found in $find_string before HTML processing begins
public $replace_string = array();
// the options below cannot be set in the config files which this class represents
//public $cache_in_apc = false; // used to decide if we should cache in apc or not
public $cache_key = null;
public static $debug = false;
protected static $apc = false;
protected static $config_path;
protected static $config_path_fallback;
protected static $config_cache = array();
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
protected static function debug($msg) {
if (self::$debug) {
//$mem = round(memory_get_usage()/1024, 2);
//$memPeak = round(memory_get_peak_usage()/1024, 2);
echo '* ',$msg;
//echo ' - mem used: ',$mem," (peak: $memPeak)\n";
echo "\n";
ob_flush();
flush();
}
}
// enable APC caching of certain site config files?
// If enabled the following site config files will be
// cached in APC cache (when requested for first time):
// * anything in site_config/custom/ and its corresponding file in site_config/standard/
// * the site config files associated with HTML fingerprints
// * the global site config file
// returns true if enabled, false otherwise
public static function use_apc($apc=true) {
if (!function_exists('apc_add')) {
if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
return false;
}
self::$apc = $apc;
return $apc;
}
// return bool or null
public function tidy($use_default=true) {
if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
return $this->tidy;
}
// return bool or null
public function prune($use_default=true) {
if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;
return $this->prune;
}
// return string or null
public function parser($use_default=true) {
if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;
return $this->parser;
}
// return bool or null
public function autodetect_on_failure($use_default=true) {
if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;
return $this->autodetect_on_failure;
}
public static function set_config_path($path, $fallback=null) {
self::$config_path = $path;
self::$config_path_fallback = $fallback;
}
public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
$key = strtolower($key);
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
if ($config->cache_key) $key = $config->cache_key;
self::$config_cache[$key] = $config;
if (self::$apc && $use_apc) {
self::debug("Adding site config to APC cache with key sc.$key");
apc_add("sc.$key", $config);
}
self::debug("Cached site config with key $key");
}
public static function is_cached($key) {
$key = strtolower($key);
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
if (array_key_exists($key, self::$config_cache)) {
return true;
} elseif (self::$apc && (bool)apc_fetch("sc.$key")) {
return true;
}
return false;
}
public function append(SiteConfig $newconfig) {
// check for commands where we accept multiple statements (no test_url)
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
// append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
}
// check for single statement commands
// we do not overwrite existing non null values
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
if ($this->$var === null) $this->$var = $newconfig->$var;
}
// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
foreach (array('find_string', 'replace_string') as $var) {
// append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_merge($this->$var, $newconfig->$var);
}
}
// returns SiteConfig instance if an appropriate one is found, false otherwise
// if $exact_host_match is true, we will not look for wildcard config matches
// by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
public static function build($host, $exact_host_match=false) {
$host = strtolower($host);
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
// check for site configuration
$try = array($host);
// should we look for wildcard matches
if (!$exact_host_match) {
$split = explode('.', $host);
if (count($split) > 1) {
array_shift($split);
$try[] = '.'.implode('.', $split);
}
}
// look for site config file in primary folder
self::debug(". looking for site config for $host in primary folder");
foreach ($try as $h) {
if (array_key_exists($h, self::$config_cache)) {
self::debug("... site config for $h already loaded in this request");
return self::$config_cache[$h];
} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
self::debug("... site config for $h in APC cache");
return $sconfig;
} elseif (file_exists(self::$config_path."/$h.txt")) {
self::debug("... found site config ($h.txt)");
$file_primary = self::$config_path."/$h.txt";
$matched_name = $h;
break;
}
}
// if we found site config, process it
if (isset($file_primary)) {
$config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if (!$config_lines || !is_array($config_lines)) return false;
$config = self::build_from_array($config_lines);
// if APC caching is available and enabled, mark this for cache
//$config->cache_in_apc = true;
$config->cache_key = $matched_name;
// if autodetec on failure is off (on by default) we do not need to look
// in secondary folder
if (!$config->autodetect_on_failure()) {
self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
return $config;
}
}
// look for site config file in secondary folder
if (isset(self::$config_path_fallback)) {
self::debug(". looking for site config for $host in secondary folder");
foreach ($try as $h) {
if (file_exists(self::$config_path_fallback."/$h.txt")) {
self::debug("... found site config in secondary folder ($h.txt)");
$file_secondary = self::$config_path_fallback."/$h.txt";
$matched_name = $h;
break;
}
}
if (!isset($file_secondary)) {
self::debug("... no site config match in secondary folder");
}
}
// return false if no config file found
if (!isset($file_primary) && !isset($file_secondary)) {
self::debug("... no site config match for $host");
return false;
}
// return primary config if secondary not found
if (!isset($file_secondary) && isset($config)) {
return $config;
}
// process secondary config file
$config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if (!$config_lines || !is_array($config_lines)) {
// failed to process secondary
if (isset($config)) {
// return primary config
return $config;
} else {
return false;
}
}
// merge with primary and return
if (isset($config)) {
self::debug('. merging config files');
$config->append(self::build_from_array($config_lines));
return $config;
} else {
// return just secondary
$config = self::build_from_array($config_lines);
// if APC caching is available and enabled, mark this for cache
//$config->cache_in_apc = true;
$config->cache_key = $matched_name;
return $config;
}
}
public static function build_from_array(array $lines) {
$config = new SiteConfig();
foreach ($lines as $line) {
$line = trim($line);
// skip comments, empty lines
if ($line == '' || $line[0] == '#') continue;
// get command
$command = explode(':', $line, 2);
// if there's no colon ':', skip this line
if (count($command) != 2) continue;
$val = trim($command[1]);
$command = trim($command[0]);
if ($command == '' || $val == '') continue;
// check for commands where we accept multiple statements
if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
array_push($config->$command, $val);
// check for single statement commands that evaluate to true or false
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
$config->$command = ($val == 'yes');
// check for single statement commands stored as strings
} elseif (in_array($command, array('parser'))) {
$config->$command = $val;
// check for replace_string(find): replace
} elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
if (in_array($match[1], array('replace_string'))) {
$command = $match[1];
array_push($config->find_string, $match[2]);
array_push($config->$command, $val);
}
}
}
return $config;
}
}

View file

@ -1,204 +0,0 @@
<?php
/**
* Univarsel Feed Writer
*
* FeedItem class - Used as feed element in FeedWriter class
*
* @package UnivarselFeedWriter
* @author Anis uddin Ahmad <anisniit@gmail.com>
* @link http://www.ajaxray.com/projects/rss
*/
class FeedItem
{
private $elements = array(); //Collection of feed elements
private $version;
/**
* Constructor
*
* @param contant (RSS1/RSS2/ATOM) RSS2 is default.
*/
function __construct($version = RSS2)
{
$this->version = $version;
}
/**
* Set element (overwrites existing elements with $elementName)
*
* @access public
* @param srting The tag name of an element
* @param srting The content of tag
* @param array Attributes(if any) in 'attrName' => 'attrValue' format
* @return void
*/
public function setElement($elementName, $content, $attributes = null)
{
if (isset($this->elements[$elementName])) {
unset($this->elements[$elementName]);
}
$this->addElement($elementName, $content, $attributes);
}
/**
* Add an element to elements array
*
* @access public
* @param srting The tag name of an element
* @param srting The content of tag
* @param array Attributes(if any) in 'attrName' => 'attrValue' format
* @return void
*/
public function addElement($elementName, $content, $attributes = null)
{
$i = 0;
if (isset($this->elements[$elementName])) {
$i = count($this->elements[$elementName]);
} else {
$this->elements[$elementName] = array();
}
$this->elements[$elementName][$i]['name'] = $elementName;
$this->elements[$elementName][$i]['content'] = $content;
$this->elements[$elementName][$i]['attributes'] = $attributes;
}
/**
* Set multiple feed elements from an array.
* Elements which have attributes cannot be added by this method
*
* @access public
* @param array array of elements in 'tagName' => 'tagContent' format.
* @return void
*/
public function addElementArray($elementArray)
{
if(! is_array($elementArray)) return;
foreach ($elementArray as $elementName => $content)
{
$this->addElement($elementName, $content);
}
}
/**
* Return the collection of elements in this feed item
*
* @access public
* @return array
*/
public function getElements()
{
return $this->elements;
}
// Wrapper functions ------------------------------------------------------
/**
* Set the 'dscription' element of feed item
*
* @access public
* @param string The content of 'description' element
* @return void
*/
public function setDescription($description)
{
$tag = ($this->version == ATOM)? 'summary' : 'description';
$this->setElement($tag, $description);
}
/**
* @desc Set the 'title' element of feed item
* @access public
* @param string The content of 'title' element
* @return void
*/
public function setTitle($title)
{
$this->setElement('title', $title);
}
/**
* Set the 'date' element of feed item
*
* @access public
* @param string The content of 'date' element
* @return void
*/
public function setDate($date)
{
if(! is_numeric($date))
{
$date = strtotime($date);
}
if($this->version == ATOM)
{
$tag = 'updated';
$value = date(DATE_ATOM, $date);
}
elseif($this->version == RSS2)
{
$tag = 'pubDate';
$value = date(DATE_RSS, $date);
}
else
{
$tag = 'dc:date';
$value = date("Y-m-d", $date);
}
$this->setElement($tag, $value);
}
/**
* Set the 'link' element of feed item
*
* @access public
* @param string The content of 'link' element
* @return void
*/
public function setLink($link)
{
if($this->version == RSS2 || $this->version == RSS1)
{
$this->setElement('link', $link);
$this->setElement('guid', $link);
}
else
{
$this->setElement('link','',array('href'=>$link));
$this->setElement('id', FeedWriter::uuid($link,'urn:uuid:'));
}
}
/**
* Set the 'source' element of feed item
*
* @access public
* @param string The content of 'source' element
* @return void
*/
public function setSource($link)
{
$attributes = array('url'=>$link);
$this->setElement('source', "wallabag",$attributes);
}
/**
* Set the 'encloser' element of feed item
* For RSS 2.0 only
*
* @access public
* @param string The url attribute of encloser tag
* @param string The length attribute of encloser tag
* @param string The type attribute of encloser tag
* @return void
*/
public function setEncloser($url, $length, $type)
{
$attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type);
$this->setElement('enclosure','',$attributes);
}
} // end of class FeedItem
?>

View file

@ -1,453 +0,0 @@
<?php
define('RSS2', 1, true);
define('JSON', 2, true);
define('JSONP', 3, true);
define('ATOM', 4, true);
/**
* Univarsel Feed Writer class
*
* Genarate RSS2 or JSON (original: RSS 1.0, RSS2.0 and ATOM Feed)
*
* Modified for FiveFilters.org's Full-Text RSS project
* to allow for inclusion of hubs, JSON output.
* Stripped RSS1 and ATOM support.
*
* @package UnivarselFeedWriter
* @author Anis uddin Ahmad <anisniit@gmail.com>
* @link http://www.ajaxray.com/projects/rss
*/
class FeedWriter
{
private $self = null; // self URL - http://feed2.w3.org/docs/warning/MissingAtomSelfLink.html
private $hubs = array(); // PubSubHubbub hubs
private $channels = array(); // Collection of channel elements
private $items = array(); // Collection of items as object of FeedItem class.
private $data = array(); // Store some other version wise data
private $CDATAEncoding = array(); // The tag names which have to encoded as CDATA
private $xsl = null; // stylesheet to render RSS (used by Chrome)
private $json = null; // JSON object
private $version = null;
/**
* Constructor
*
* @param constant the version constant (RSS2 or JSON).
*/
function __construct($version = RSS2)
{
$this->version = $version;
// Setting default value for assential channel elements
$this->channels['title'] = $version . ' Feed';
$this->channels['link'] = 'http://www.ajaxray.com/blog';
//Tag names to encode in CDATA
$this->CDATAEncoding = array('description', 'content:encoded', 'content', 'subtitle', 'summary');
}
public function setFormat($format) {
$this->version = $format;
}
// Start # public functions ---------------------------------------------
/**
* Set a channel element
* @access public
* @param srting name of the channel tag
* @param string content of the channel tag
* @return void
*/
public function setChannelElement($elementName, $content)
{
$this->channels[$elementName] = $content ;
}
/**
* Set multiple channel elements from an array. Array elements
* should be 'channelName' => 'channelContent' format.
*
* @access public
* @param array array of channels
* @return void
*/
public function setChannelElementsFromArray($elementArray)
{
if(! is_array($elementArray)) return;
foreach ($elementArray as $elementName => $content)
{
$this->setChannelElement($elementName, $content);
}
}
/**
* Genarate the actual RSS/JSON file
*
* @access public
* @return void
*/
public function genarateFeed($withHeaders = true)
{
if ($withHeaders) {
if ($this->version == RSS2) {
header('Content-type: text/xml; charset=UTF-8');
// this line prevents Chrome 20 from prompting download
// used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss
header('X-content-type-options: nosniff');
} elseif ($this->version == JSON) {
header('Content-type: application/json; charset=UTF-8');
} elseif ($this->version == JSONP) {
header('Content-type: application/javascript; charset=UTF-8');
}
}
if ($this->version == JSON || $this->version == JSONP) {
$this->json = new stdClass();
}
$this->printHead();
$this->printChannels();
$this->printItems();
$this->printTale();
if ($this->version == JSON || $this->version == JSONP) {
echo json_encode($this->json);
}
}
public function &getItems()
{
return $this->items;
}
/**
* Create a new FeedItem.
*
* @access public
* @return object instance of FeedItem class
*/
public function createNewItem()
{
$Item = new FeedItem($this->version);
return $Item;
}
/**
* Add a FeedItem to the main class
*
* @access public
* @param object instance of FeedItem class
* @return void
*/
public function addItem($feedItem)
{
$this->items[] = $feedItem;
}
// Wrapper functions -------------------------------------------------------------------
/**
* Set the 'title' channel element
*
* @access public
* @param srting value of 'title' channel tag
* @return void
*/
public function setTitle($title)
{
$this->setChannelElement('title', $title);
}
/**
* Add a hub to the channel element
*
* @access public
* @param string URL
* @return void
*/
public function addHub($hub)
{
$this->hubs[] = $hub;
}
/**
* Set XSL URL
*
* @access public
* @param string URL
* @return void
*/
public function setXsl($xsl)
{
$this->xsl = $xsl;
}
/**
* Set self URL
*
* @access public
* @param string URL
* @return void
*/
public function setSelf($self)
{
$this->self = $self;
}
/**
* Set the 'description' channel element
*
* @access public
* @param srting value of 'description' channel tag
* @return void
*/
public function setDescription($description)
{
$tag = ($this->version == ATOM)? 'subtitle' : 'description';
$this->setChannelElement($tag, $description);
}
/**
* Set the 'link' channel element
*
* @access public
* @param srting value of 'link' channel tag
* @return void
*/
public function setLink($link)
{
$this->setChannelElement('link', $link);
}
/**
* Set the 'image' channel element
*
* @access public
* @param srting title of image
* @param srting link url of the imahe
* @param srting path url of the image
* @return void
*/
public function setImage($title, $link, $url)
{
$this->setChannelElement('image', array('title'=>$title, 'link'=>$link, 'url'=>$url));
}
// End # public functions ----------------------------------------------
// Start # private functions ----------------------------------------------
/**
* Prints the xml and rss namespace
*
* @access private
* @return void
*/
private function printHead()
{
if ($this->version == RSS2)
{
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
echo $out;
}
elseif ($this->version == JSON || $this->version == JSONP)
{
$this->json->rss = array('@attributes' => array('version' => '2.0'));
}
}
/**
* Closes the open tags at the end of file
*
* @access private
* @return void
*/
private function printTale()
{
if ($this->version == RSS2)
{
echo '</channel>',PHP_EOL,'</rss>';
}
// do nothing for JSON
}
/**
* Creates a single node as xml format
*
* @access private
* @param string name of the tag
* @param mixed tag value as string or array of nested tags in 'tagName' => 'tagValue' format
* @param array Attributes(if any) in 'attrName' => 'attrValue' format
* @return string formatted xml tag
*/
private function makeNode($tagName, $tagContent, $attributes = null)
{
if ($this->version == RSS2)
{
$nodeText = '';
$attrText = '';
if (is_array($attributes))
{
foreach ($attributes as $key => $value)
{
$attrText .= " $key=\"$value\" ";
}
}
$nodeText .= "<{$tagName}{$attrText}>";
if (is_array($tagContent))
{
foreach ($tagContent as $key => $value)
{
$nodeText .= $this->makeNode($key, $value);
}
}
else
{
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent);
$nodeText .= htmlspecialchars($tagContent);
}
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>";
$nodeText .= "</$tagName>";
return $nodeText . PHP_EOL;
}
elseif ($this->version == JSON || $this->version == JSONP)
{
$tagName = (string)$tagName;
$tagName = strtr($tagName, ':', '_');
$node = null;
if (!$tagContent && is_array($attributes) && count($attributes))
{
$node = array('@attributes' => $this->json_keys($attributes));
} else {
if (is_array($tagContent)) {
$node = $this->json_keys($tagContent);
} else {
$node = $tagContent;
}
}
return $node;
}
return ''; // should not get here
}
private function json_keys(array $array) {
$new = array();
foreach ($array as $key => $val) {
if (is_string($key)) $key = strtr($key, ':', '_');
if (is_array($val)) {
$new[$key] = $this->json_keys($val);
} else {
$new[$key] = $val;
}
}
return $new;
}
/**
* @desc Print channels
* @access private
* @return void
*/
private function printChannels()
{
//Start channel tag
if ($this->version == RSS2) {
echo '<channel>' . PHP_EOL;
// add hubs
foreach ($this->hubs as $hub) {
//echo $this->makeNode('link', '', array('rel'=>'hub', 'href'=>$hub, 'xmlns'=>'http://www.w3.org/2005/Atom'));
echo '<link rel="hub" href="'.htmlspecialchars($hub).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL;
}
// add self
if (isset($this->self)) {
//echo $this->makeNode('link', '', array('rel'=>'self', 'href'=>$this->self, 'xmlns'=>'http://www.w3.org/2005/Atom'));
echo '<link rel="self" href="'.htmlspecialchars($this->self).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL;
}
//Print Items of channel
foreach ($this->channels as $key => $value)
{
echo $this->makeNode($key, $value);
}
} elseif ($this->version == JSON || $this->version == JSONP) {
$this->json->rss['channel'] = (object)$this->json_keys($this->channels);
}
}
/**
* Prints formatted feed items
*
* @access private
* @return void
*/
private function printItems()
{
foreach ($this->items as $item) {
$itemElements = $item->getElements();
echo $this->startItem();
if ($this->version == JSON || $this->version == JSONP) {
$json_item = array();
}
foreach ($itemElements as $thisElement) {
foreach ($thisElement as $instance) {
if ($this->version == RSS2) {
echo $this->makeNode($instance['name'], $instance['content'], $instance['attributes']);
} elseif ($this->version == JSON || $this->version == JSONP) {
$_json_node = $this->makeNode($instance['name'], $instance['content'], $instance['attributes']);
if (count($thisElement) > 1) {
$json_item[strtr($instance['name'], ':', '_')][] = $_json_node;
} else {
$json_item[strtr($instance['name'], ':', '_')] = $_json_node;
}
}
}
}
echo $this->endItem();
if ($this->version == JSON || $this->version == JSONP) {
if (count($this->items) > 1) {
$this->json->rss['channel']->item[] = $json_item;
} else {
$this->json->rss['channel']->item = $json_item;
}
}
}
}
/**
* Make the starting tag of channels
*
* @access private
* @return void
*/
private function startItem()
{
if ($this->version == RSS2)
{
echo '<item>' . PHP_EOL;
}
// nothing for JSON
}
/**
* Closes feed item tag
*
* @access private
* @return void
*/
private function endItem()
{
if ($this->version == RSS2)
{
echo '</item>' . PHP_EOL;
}
// nothing for JSON
}
// End # private functions ----------------------------------------------
}

View file

@ -1,114 +0,0 @@
<?php
// warning: this file is encoded in UTF-8!
class HTML5_Data
{
// at some point this should be moved to a .ser file. Another
// possible optimization is to give UTF-8 bytes, not Unicode
// codepoints
// XXX: Not quite sure why it's named this; this is
// actually the numeric entity dereference table.
protected static $realCodepointTable = array(
0x00 => 0xFFFD, // REPLACEMENT CHARACTER
0x0D => 0x000A, // LINE FEED (LF)
0x80 => 0x20AC, // EURO SIGN ('€')
0x81 => 0x0081, // <control>
0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('')
0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
0x86 => 0x2020, // DAGGER ('†')
0x87 => 0x2021, // DOUBLE DAGGER ('‡')
0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
0x89 => 0x2030, // PER MILLE SIGN ('‰')
0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('')
0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
0x8D => 0x008D, // <control>
0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
0x8F => 0x008F, // <control>
0x90 => 0x0090, // <control>
0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('')
0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('')
0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
0x95 => 0x2022, // BULLET ('•')
0x96 => 0x2013, // EN DASH ('')
0x97 => 0x2014, // EM DASH ('—')
0x98 => 0x02DC, // SMALL TILDE ('˜')
0x99 => 0x2122, // TRADE MARK SIGN ('™')
0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('')
0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
0x9D => 0x009D, // <control>
0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
);
protected static $namedCharacterReferences;
protected static $namedCharacterReferenceMaxLength;
/**
* Returns the "real" Unicode codepoint of a malformed character
* reference.
*/
public static function getRealCodepoint($ref) {
if (!isset(self::$realCodepointTable[$ref])) return false;
else return self::$realCodepointTable[$ref];
}
public static function getNamedCharacterReferences() {
if (!self::$namedCharacterReferences) {
self::$namedCharacterReferences = unserialize(
file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
}
return self::$namedCharacterReferences;
}
/**
* Converts a Unicode codepoint to sequence of UTF-8 bytes.
* @note Shamelessly stolen from HTML Purifier, which is also
* shamelessly stolen from Feyd (which is in public domain).
*/
public static function utf8chr($code) {
/* We don't care: we live dangerously
* if($code > 0x10FFFF or $code < 0x0 or
($code >= 0xD800 and $code <= 0xDFFF) ) {
// bits are set outside the "valid" range as defined
// by UNICODE 4.1.0
return "\xEF\xBF\xBD";
}*/
$x = $y = $z = $w = 0;
if ($code < 0x80) {
// regular ASCII character
$x = $code;
} else {
// set up bits for UTF-8
$x = ($code & 0x3F) | 0x80;
if ($code < 0x800) {
$y = (($code & 0x7FF) >> 6) | 0xC0;
} else {
$y = (($code & 0xFC0) >> 6) | 0x80;
if($code < 0x10000) {
$z = (($code >> 12) & 0x0F) | 0xE0;
} else {
$z = (($code >> 12) & 0x3F) | 0x80;
$w = (($code >> 18) & 0x07) | 0xF0;
}
}
}
// set up the actual character
$ret = '';
if($w) $ret .= chr($w);
if($z) $ret .= chr($z);
if($y) $ret .= chr($y);
$ret .= chr($x);
return $ret;
}
}

View file

@ -1,284 +0,0 @@
<?php
/*
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Some conventions:
// /* */ indicates verbatim text from the HTML 5 specification
// // indicates regular comments
class HTML5_InputStream {
/**
* The string data we're parsing.
*/
private $data;
/**
* The current integer byte position we are in $data
*/
private $char;
/**
* Length of $data; when $char === $data, we are at the end-of-file.
*/
private $EOF;
/**
* Parse errors.
*/
public $errors = array();
/**
* @param $data Data to parse
*/
public function __construct($data) {
/* Given an encoding, the bytes in the input stream must be
converted to Unicode characters for the tokeniser, as
described by the rules for that encoding, except that the
leading U+FEFF BYTE ORDER MARK character, if any, must not
be stripped by the encoding layer (it is stripped by the rule below).
Bytes or sequences of bytes in the original byte stream that
could not be converted to Unicode characters must be converted
to U+FFFD REPLACEMENT CHARACTER code points. */
// XXX currently assuming input data is UTF-8; once we
// build encoding detection this will no longer be the case
//
// We previously had an mbstring implementation here, but that
// implementation is heavily non-conforming, so it's been
// omitted.
if (extension_loaded('iconv')) {
// non-conforming
$data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
} else {
// we can make a conforming native implementation
throw new Exception('Not implemented, please install mbstring or iconv');
}
/* One leading U+FEFF BYTE ORDER MARK character must be
ignored if any are present. */
if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
$data = substr($data, 3);
}
/* All U+0000 NULL characters in the input must be replaced
by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
characters is a parse error. */
for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
$this->errors[] = array(
'type' => HTML5_Tokenizer::PARSEERROR,
'data' => 'null-character'
);
}
/* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
(LF) characters are treated specially. Any CR characters
that are followed by LF characters must be removed, and any
CR characters not followed by LF characters must be converted
to LF characters. Thus, newlines in HTML DOMs are represented
by LF characters, and there are never any CR characters in the
input to the tokenization stage. */
$data = str_replace(
array(
"\0",
"\r\n",
"\r"
),
array(
"\xEF\xBF\xBD",
"\n",
"\n"
),
$data
);
/* Any occurrences of any characters in the ranges U+0001 to
U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
U+10FFFF are parse errors. (These are all control characters
or permanently undefined Unicode characters.) */
// Check PCRE is loaded.
if (extension_loaded('pcre')) {
$count = preg_match_all(
'/(?:
[\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
|
\xC2[\x80-\x9F] # U+0080 to U+009F
|
\xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
|
\xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
|
\xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
|
[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
)/x',
$data,
$matches
);
for ($i = 0; $i < $count; $i++) {
$this->errors[] = array(
'type' => HTML5_Tokenizer::PARSEERROR,
'data' => 'invalid-codepoint'
);
}
} else {
// XXX: Need non-PCRE impl, probably using substr_count
}
$this->data = $data;
$this->char = 0;
$this->EOF = strlen($data);
}
/**
* Returns the current line that the tokenizer is at.
*/
public function getCurrentLine() {
// Check the string isn't empty
if($this->EOF) {
// Add one to $this->char because we want the number for the next
// byte to be processed.
return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
} else {
// If the string is empty, we are on the first line (sorta).
return 1;
}
}
/**
* Returns the current column of the current line that the tokenizer is at.
*/
public function getColumnOffset() {
// strrpos is weird, and the offset needs to be negative for what we
// want (i.e., the last \n before $this->char). This needs to not have
// one (to make it point to the next character, the one we want the
// position of) added to it because strrpos's behaviour includes the
// final offset byte.
$lastLine = strrpos($this->data, "\n", $this->char - 1 - strlen($this->data));
// However, for here we want the length up until the next byte to be
// processed, so add one to the current byte ($this->char).
if($lastLine !== false) {
$findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
} else {
$findLengthOf = substr($this->data, 0, $this->char);
}
// Get the length for the string we need.
if(extension_loaded('iconv')) {
return iconv_strlen($findLengthOf, 'utf-8');
} elseif(extension_loaded('mbstring')) {
return mb_strlen($findLengthOf, 'utf-8');
} elseif(extension_loaded('xml')) {
return strlen(utf8_decode($findLengthOf));
} else {
$count = count_chars($findLengthOf);
// 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
// 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range)
return array_sum(array_slice($count, 0, 0x80)) +
array_sum(array_slice($count, 0xC2, 0x33));
}
}
/**
* Retrieve the currently consume character.
* @note This performs bounds checking
*/
public function char() {
return ($this->char++ < $this->EOF)
? $this->data[$this->char - 1]
: false;
}
/**
* Get all characters until EOF.
* @note This performs bounds checking
*/
public function remainingChars() {
if($this->char < $this->EOF) {
$data = substr($this->data, $this->char);
$this->char = $this->EOF;
return $data;
} else {
return false;
}
}
/**
* Matches as far as possible until we reach a certain set of bytes
* and returns the matched substring.
* @param $bytes Bytes to match.
*/
public function charsUntil($bytes, $max = null) {
if ($this->char < $this->EOF) {
if ($max === 0 || $max) {
$len = strcspn($this->data, $bytes, $this->char, $max);
} else {
$len = strcspn($this->data, $bytes, $this->char);
}
$string = (string) substr($this->data, $this->char, $len);
$this->char += $len;
return $string;
} else {
return false;
}
}
/**
* Matches as far as possible with a certain set of bytes
* and returns the matched substring.
* @param $bytes Bytes to match.
*/
public function charsWhile($bytes, $max = null) {
if ($this->char < $this->EOF) {
if ($max === 0 || $max) {
$len = strspn($this->data, $bytes, $this->char, $max);
} else {
$len = strspn($this->data, $bytes, $this->char);
}
$string = (string) substr($this->data, $this->char, $len);
$this->char += $len;
return $string;
} else {
return false;
}
}
/**
* Unconsume one character.
*/
public function unget() {
if ($this->char <= $this->EOF) {
$this->char--;
}
}
}

View file

@ -1,36 +0,0 @@
<?php
require_once dirname(__FILE__) . '/Data.php';
require_once dirname(__FILE__) . '/InputStream.php';
require_once dirname(__FILE__) . '/TreeBuilder.php';
require_once dirname(__FILE__) . '/Tokenizer.php';
/**
* Outwards facing interface for HTML5.
*/
class HTML5_Parser
{
/**
* Parses a full HTML document.
* @param $text HTML text to parse
* @param $builder Custom builder implementation
* @return Parsed HTML as DOMDocument
*/
static public function parse($text, $builder = null) {
$tokenizer = new HTML5_Tokenizer($text, $builder);
$tokenizer->parse();
return $tokenizer->save();
}
/**
* Parses an HTML fragment.
* @param $text HTML text to parse
* @param $context String name of context element to pretend parsing is in.
* @param $builder Custom builder implementation
* @return Parsed HTML as DOMDocument
*/
static public function parseFragment($text, $context = null, $builder = null) {
$tokenizer = new HTML5_Tokenizer($text, $builder);
$tokenizer->parseFragment($context);
return $tokenizer->save();
}
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

View file

@ -1,403 +0,0 @@
<?php
/**
* Cookie Jar
*
* PHP class for handling cookies, as defined by the Netscape spec:
* <http://curl.haxx.se/rfc/cookie_spec.html>
*
* This class should be used to handle cookies (storing cookies from HTTP response messages, and
* sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org
* from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
*
* This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
* lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
* Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
*
* @version 0.5
* @date 2011-03-15
* @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh
* @copyright 2011 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
class CookieJar
{
/**
* Cookies - array containing all cookies.
*
* <pre>
* Cookies are stored like this:
* [domain][path][name] = array
* where array is:
* 0 => value, 1 => secure, 2 => expires
* </pre>
* @var array
* @access private
*/
public $cookies = array();
public $debug = false;
/**
* Constructor
*/
function __construct() {
}
protected function debug($msg, $file=null, $line=null) {
if ($this->debug) {
$mem = round(memory_get_usage()/1024, 2);
$memPeak = round(memory_get_peak_usage()/1024, 2);
echo '* ',$msg;
if (isset($file, $line)) echo " ($file line $line)";
echo ' - mem used: ',$mem," (peak: $memPeak)\n";
ob_flush();
flush();
}
}
/**
* Get matching cookies
*
* Only use this method if you cannot use add_cookie_header(), for example, if you want to use
* this cookie jar class without using the request class.
*
* @param array $param associative array containing 'domain', 'path', 'secure' keys
* @return string
* @see add_cookie_header()
*/
public function getMatchingCookies($url)
{
if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) {
$param['domain'] = $parts['host'];
$param['path'] = $parts['path'];
$param['secure'] = (strtolower($parts['scheme']) == 'https');
unset($parts);
} else {
return false;
}
// RFC 2965 notes:
// If multiple cookies satisfy the criteria above, they are ordered in
// the Cookie header such that those with more specific Path attributes
// precede those with less specific. Ordering with respect to other
// attributes (e.g., Domain) is unspecified.
$domain = $param['domain'];
if (strpos($domain, '.') === false) $domain .= '.local';
$request_path = $param['path'];
if ($request_path == '') $request_path = '/';
$request_secure = $param['secure'];
$now = time();
$matched_cookies = array();
// domain - find matching domains
$this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
while (strpos($domain, '.') !== false) {
if (isset($this->cookies[$domain])) {
$this->debug(' domain match found: '.$domain);
$cookies =& $this->cookies[$domain];
} else {
$domain = $this->_reduce_domain($domain);
continue;
}
// paths - find matching paths starting from most specific
$this->debug(' - Finding matching paths for '.$request_path);
$paths = array_keys($cookies);
usort($paths, array($this, '_cmp_length'));
foreach ($paths as $path) {
// continue to next cookie if request path does not path-match cookie path
if (!$this->_path_match($request_path, $path)) continue;
// loop through cookie names
$this->debug(' path match found: '.$path);
foreach ($cookies[$path] as $name => $values) {
// if this cookie is secure but request isn't, continue to next cookie
if ($values[1] && !$request_secure) continue;
// if cookie is not a session cookie and has expired, continue to next cookie
if (is_int($values[2]) && ($values[2] < $now)) continue;
// cookie matches request
$this->debug(' cookie match: '.$name.'='.$values[0]);
$matched_cookies[] = $name.'='.$values[0];
}
}
$domain = $this->_reduce_domain($domain);
}
// return cookies
return implode('; ', $matched_cookies);
}
/**
* Parse Set-Cookie values.
*
* Only use this method if you cannot use extract_cookies(), for example, if you want to use
* this cookie jar class without using the response class.
*
* @param array $set_cookies array holding 1 or more "Set-Cookie" header values
* @param array $param associative array containing 'host', 'path' keys
* @return void
* @see extract_cookies()
*/
public function storeCookies($url, $set_cookies)
{
if (count($set_cookies) == 0) return;
$param = @parse_url($url);
if (!is_array($param) || !isset($param['host'])) return;
$request_host = $param['host'];
if (strpos($request_host, '.') === false) $request_host .= '.local';
$request_path = @$param['path'];
if ($request_path == '') $request_path = '/';
//
// loop through set-cookie headers
//
foreach ($set_cookies as $set_cookie) {
$this->debug('Parsing: '.$set_cookie);
// temporary cookie store (before adding to jar)
$tmp_cookie = array();
$param = explode(';', $set_cookie);
// loop through params
for ($x=0; $x<count($param); $x++) {
$key_val = explode('=', $param[$x], 2);
if (count($key_val) != 2) {
// if the first param isn't a name=value pair, continue to the next set-cookie
// header
if ($x == 0) continue 2;
// check for secure flag
if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
// continue to next param
continue;
}
list($key, $val) = array_map('trim', $key_val);
// first name=value pair is the cookie name and value
// the name and value are stored under 'name' and 'value' to avoid conflicts
// with later parameters.
if ($x == 0) {
$tmp_cookie = array('name'=>$key, 'value'=>$val);
continue;
}
$key = strtolower($key);
if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
$tmp_cookie[$key] = $val;
}
}
//
// set cookie
//
// check domain
if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
($tmp_cookie['domain'] != ".$request_host")) {
$domain = $tmp_cookie['domain'];
if ((strpos($domain, '.') === false) && ($domain != 'local')) {
$this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
continue;
}
if (preg_match('/\.[0-9]+$/', $domain)) {
$this->debug(' - domain "'.$domain.'" appears to be an ip address');
continue;
}
if (substr($domain, 0, 1) != '.') $domain = ".$domain";
if (!$this->_domain_match($request_host, $domain)) {
$this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
continue;
}
} else {
// if domain is not specified in the set-cookie header, domain will default to
// the request host
$domain = $request_host;
}
// check path
if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
$path = urldecode($tmp_cookie['path']);
if (!$this->_path_match($request_path, $path)) {
$this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
continue;
}
} else {
$path = $request_path;
$path = substr($path, 0, strrpos($path, '/'));
if ($path == '') $path = '/';
}
// check if secure
$secure = (isset($tmp_cookie['secure'])) ? true : false;
// check expiry
if (isset($tmp_cookie['expires'])) {
if (($expires = strtotime($tmp_cookie['expires'])) < 0) {
$expires = null;
}
} else {
$expires = null;
}
// set cookie
$this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
}
}
// return array of set-cookie values extracted from HTTP response headers (string $h)
public function extractCookies($h) {
$x = 0;
$lines = 0;
$headers = array();
$last_match = false;
$h = explode("\n", $h);
foreach ($h as $line) {
$line = rtrim($line);
$lines++;
$trimmed_line = trim($line);
if (isset($line_last)) {
// check if we have \r\n\r\n (indicating the end of headers)
// some servers will not use CRLF (\r\n), so we make CR (\r) optional.
// if (preg_match('/\015?\012\015?\012/', $line_last.$line)) {
// break;
// }
// As an alternative, we can check if the current trimmed line is empty
if ($trimmed_line == '') {
break;
}
// check for continuation line...
// RFC 2616 Section 2.2 "Basic Rules":
// HTTP/1.1 header field values can be folded onto multiple lines if the
// continuation line begins with a space or horizontal tab. All linear
// white space, including folding, has the same semantics as SP. A
// recipient MAY replace any linear white space with a single SP before
// interpreting the field value or forwarding the message downstream.
if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) {
// append to previous header value
$headers[$x-1] .= ' '.rtrim($match[1]);
continue;
}
}
$line_last = $line;
// split header name and value
if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {
$headers[$x++] = rtrim($match[1]);
$last_match = true;
} else {
$last_match = false;
}
}
return $headers;
}
/**
* Set Cookie
* @param string $domain
* @param string $path
* @param string $name cookie name
* @param string $value cookie value
* @param bool $secure
* @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
* @return void
*/
function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
{
if ($domain == '') return;
if ($path == '') return;
if ($name == '') return;
// check if cookie needs to go
if (isset($expires) && ($expires <= 0)) {
if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
return;
}
if ($value == '') return;
$this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
return;
}
/**
* Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.
* @param string $domain
* @param string $path
* @param string $name
* @return void
*/
function clear($domain=null, $path=null, $name=null)
{
if (!isset($domain)) {
$this->cookies = array();
} elseif (!isset($path)) {
if (isset($this->cookies[$domain])) unset($this->cookies[$domain]);
} elseif (!isset($name)) {
if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]);
} elseif (isset($name)) {
if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
}
}
/**
* Compare string length - used for sorting
* @access private
* @return int
*/
function _cmp_length($a, $b)
{
$la = strlen($a); $lb = strlen($b);
if ($la == $lb) return 0;
return ($la > $lb) ? -1 : 1;
}
/**
* Reduce domain
* @param string $domain
* @return string
* @access private
*/
function _reduce_domain($domain)
{
if ($domain == '') return '';
if (substr($domain, 0, 1) == '.') return substr($domain, 1);
return substr($domain, strpos($domain, '.'));
}
/**
* Path match - check if path1 path-matches path2
*
* From RFC 2965:
* <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
* if P2 is a prefix of P1 (including the case where P1 and P2 string-
* compare equal). Thus, the string /tec/waldo path-matches /tec.</i>
* @param string $path1
* @param string $path2
* @return bool
* @access private
*/
function _path_match($path1, $path2)
{
return (substr($path1, 0, strlen($path2)) == $path2);
}
/**
* Domain match - check if domain1 domain-matches domain2
*
* A few extracts from RFC 2965:
* - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
* would be rejected, because H is y.x and contains a dot.
*
* - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
* would be accepted.
*
* - A Set-Cookie2 with Domain=.com or Domain=.com., will always be
* rejected, because there is no embedded dot.
*
* - A Set-Cookie2 from request-host example for Domain=.local will
* be accepted, because the effective host name for the request-
* host is example.local, and example.local domain-matches .local.
*
* I'm ignoring the first point for now (must check to see how other browsers handle
* this rule for Set-Cookie headers)
*
* @param string $domain1
* @param string $domain2
* @return bool
* @access private
*/
function _domain_match($domain1, $domain2)
{
$domain1 = strtolower($domain1);
$domain2 = strtolower($domain2);
while (strpos($domain1, '.') !== false) {
if ($domain1 == $domain2) return true;
$domain1 = $this->_reduce_domain($domain1);
continue;
}
return false;
}
}

View file

@ -1,810 +0,0 @@
<?php
/**
* Humble HTTP Agent
*
* This class is designed to take advantage of parallel HTTP requests
* offered by PHP's PECL HTTP extension or the curl_multi_* functions.
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 1.4
* @date 2013-05-10
* @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh
* @copyright 2011-2013 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
class HumbleHttpAgent
{
const METHOD_REQUEST_POOL = 1;
const METHOD_CURL_MULTI = 2;
const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
const UA_PHP = 'PHP/5.4';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array();
protected $redirectQueue = array();
protected $requestOptions;
protected $maxParallelRequests = 5;
protected $cache = null; //TODO
protected $httpContext;
protected $minimiseMemoryUse = false; //TODO
protected $method;
protected $cookieJar;
public $debug = false;
public $debugVerbose = false;
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
public $maxRedirects = 5;
public $userAgentMap = array();
public $rewriteUrls = array();
public $userAgentDefault;
public $referer;
//public $userAgent = 'Mozilla/5.0';
// Prevent certain file/mime types
// HTTP responses which match these content types will
// be returned without body.
public $headerOnlyTypes = array();
// URLs ending with one of these extensions will
// prompt Humble HTTP Agent to send a HEAD request first
// to see if returned content type matches $headerOnlyTypes.
public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
// AJAX triggers to search for.
// for AJAX sites, e.g. Blogger with its dynamic views templates.
public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
//TODO: set max file size
//TODO: normalise headers
function __construct($requestOptions=null, $method=null) {
$this->userAgentDefault = self::UA_BROWSER;
$this->referer = self::REF_GOOGLE;
// set the request method
if (in_array($method, array(1,2,4))) {
$this->method = $method;
} else {
if (class_exists('HttpRequestPool')) {
$this->method = self::METHOD_REQUEST_POOL;
} elseif (function_exists('curl_multi_init')) {
$this->method = self::METHOD_CURL_MULTI;
} else {
$this->method = self::METHOD_FILE_GET_CONTENTS;
}
}
if ($this->method == self::METHOD_CURL_MULTI) {
require_once(dirname(__FILE__).'/RollingCurl.php');
}
// create cookie jar
$this->cookieJar = new CookieJar();
// set request options (redirect must be 0)
$this->requestOptions = array(
'timeout' => 15,
'connecttimeout' => 15,
'dns_cache_timeout' => 300,
'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
// TODO: test onprogress?
);
if (is_array($requestOptions)) {
$this->requestOptions = array_merge($this->requestOptions, $requestOptions);
}
$this->httpContext = array(
'http' => array(
'ignore_errors' => true,
'timeout' => $this->requestOptions['timeout'],
'max_redirects' => $this->requestOptions['redirect'],
'header' => "Accept: */*\r\n"
)
);
}
protected function debug($msg) {
if ($this->debug) {
$mem = round(memory_get_usage()/1024, 2);
$memPeak = round(memory_get_peak_usage()/1024, 2);
echo '* ',$msg;
if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
echo "\n";
ob_flush();
flush();
}
}
protected function getUserAgent($url, $asArray=false) {
$host = @parse_url($url, PHP_URL_HOST);
if (strtolower(substr($host, 0, 4)) == 'www.') {
$host = substr($host, 4);
}
if ($host) {
$try = array($host);
$split = explode('.', $host);
if (count($split) > 1) {
array_shift($split);
$try[] = '.'.implode('.', $split);
}
foreach ($try as $h) {
if (isset($this->userAgentMap[$h])) {
$ua = $this->userAgentMap[$h];
break;
}
}
}
if (!isset($ua)) $ua = $this->userAgentDefault;
if ($asArray) {
return array('User-Agent' => $ua);
} else {
return 'User-Agent: '.$ua;
}
}
public function rewriteHashbangFragment($url) {
// return $url if there's no '#!'
if (strpos($url, '#!') === false) return $url;
// split $url and rewrite
// TODO: is SimplePie_IRI included?
$iri = new SimplePie_IRI($url);
$fragment = substr($iri->fragment, 1); // strip '!'
$iri->fragment = null;
if (isset($iri->query)) {
parse_str($iri->query, $query);
} else {
$query = array();
}
$query['_escaped_fragment_'] = (string)$fragment;
$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
return $iri->get_iri();
}
public function getRedirectURLfromHTML($url, $html) {
$redirect_url = $this->getMetaRefreshURL($url, $html);
if (!$redirect_url) {
$redirect_url = $this->getUglyURL($url, $html);
}
return $redirect_url;
}
public function getMetaRefreshURL($url, $html) {
if ($html == '') return false;
// <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) {
return false;
}
$redirect_url = $match[1];
if (preg_match('!^https?://!i', $redirect_url)) {
// already absolute
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
return $redirect_url;
}
// absolutize redirect URL
$base = new SimplePie_IRI($url);
// remove '//' in URL path (causes URLs not to resolve properly)
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
return $absolute;
}
return false;
}
public function getUglyURL($url, $html) {
if ($html == '') return false;
$found = false;
foreach ($this->ajaxTriggers as $string) {
if (stripos($html, $string)) {
$found = true;
break;
}
}
if (!$found) return false;
$iri = new SimplePie_IRI($url);
if (isset($iri->query)) {
parse_str($iri->query, $query);
} else {
$query = array();
}
$query['_escaped_fragment_'] = '';
$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
$ugly_url = $iri->get_iri();
$this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
return $ugly_url;
}
public function removeFragment($url) {
$pos = strpos($url, '#');
if ($pos === false) {
return $url;
} else {
return substr($url, 0, $pos);
}
}
public function rewriteUrls($url) {
foreach ($this->rewriteUrls as $find => $action) {
if (strpos($url, $find) !== false) {
if (is_array($action)) {
return strtr($url, $action);
}
}
}
return $url;
}
public function enableDebug($bool=true) {
$this->debug = (bool)$bool;
}
public function minimiseMemoryUse($bool = true) {
$this->minimiseMemoryUse = $bool;
}
public function setMaxParallelRequests($max) {
$this->maxParallelRequests = $max;
}
public function validateUrl($url) {
$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
if ($test === false) {
$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
}
if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
return $url;
} else {
return false;
}
}
public function fetchAll(array $urls) {
$this->fetchAllOnce($urls, $isRedirect=false);
$redirects = 0;
while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
$this->debug("Following redirects #$redirects...");
$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
}
}
// fetch all URLs without following redirects
public function fetchAllOnce(array $urls, $isRedirect=false) {
if (!$isRedirect) $urls = array_unique($urls);
if (empty($urls)) return;
//////////////////////////////////////////////////////
// parallel (HttpRequestPool)
if ($this->method == self::METHOD_REQUEST_POOL) {
$this->debug('Starting parallel fetch (HttpRequestPool)');
try {
while (count($urls) > 0) {
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
$subset = array_splice($urls, 0, $this->maxParallelRequests);
$pool = new HttpRequestPool();
foreach ($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]);
$this->debug("...$url");
if (!$isRedirect && isset($this->requests[$url])) {
$this->debug("......in memory");
/*
} elseif ($this->isCached($url)) {
$this->debug("......is cached");
if (!$this->minimiseMemoryUse) {
$this->requests[$url] = $this->getCached($url);
}
*/
} else {
$this->debug("......adding to pool");
$req_url = $this->rewriteUrls($url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
$_meth = HttpRequest::METH_HEAD;
} else {
$_meth = HttpRequest::METH_GET;
unset($this->requests[$orig]['wrongGuess']);
}
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
$this->debug("......sending cookies: $cookies");
$httpRequest->addHeaders(array('Cookie' => $cookies));
}
//$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
$httpRequest->addHeaders($this->getUserAgent($req_url, true));
// add referer for picky sites
$httpRequest->addheaders(array('Referer' => $this->referer));
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$this->requests[$orig]['original_url'] = $orig;
$pool->attach($httpRequest);
}
}
// did we get anything into the pool?
if (count($pool) > 0) {
$this->debug('Sending request...');
try {
$pool->send();
} catch (HttpRequestPoolException $e) {
// do nothing
}
$this->debug('Received responses');
foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
$request = $this->requests[$orig]['httpRequest'];
//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
// getResponseHeader() doesn't return status line, so, for consistency...
$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
// check content type
// TODO: use getResponseHeader('content-type') or getResponseInfo()
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
$this->requests[$orig]['body'] = '';
$_header_only_type = true;
$this->debug('Header only type returned');
} else {
$this->requests[$orig]['body'] = $request->getResponseBody();
$_header_only_type = false;
}
$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
// is redirect?
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
$redirectURL = $request->getResponseHeader('location');
if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
$cookies = $request->getResponseHeader('set-cookie');
if ($cookies && !is_array($cookies)) $cookies = array($cookies);
if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
}
} elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
// the response content-type did not match our 'header only' types,
// but we'd issues a HEAD request because we assumed it would. So
// let's queue a proper GET request for this item...
$this->debug('Wrong guess at content-type, queing GET request');
$this->requests[$orig]['wrongGuess'] = true;
$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
// check for <meta name='fragment' content='!'/>
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) {
$this->redirectQueue[$orig] = $redirectURL;
}
}
}
//die($url.' -multi- '.$request->getResponseInfo('effective_url'));
$pool->detach($request);
unset($this->requests[$orig]['httpRequest'], $request);
/*
if ($this->minimiseMemoryUse) {
if ($this->cache($url)) {
unset($this->requests[$url]);
}
}
*/
}
}
}
} catch (HttpException $e) {
$this->debug($e);
return false;
}
}
//////////////////////////////////////////////////////////
// parallel (curl_multi_*)
elseif ($this->method == self::METHOD_CURL_MULTI) {
$this->debug('Starting parallel fetch (curl_multi_*)');
while (count($urls) > 0) {
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
$subset = array_splice($urls, 0, $this->maxParallelRequests);
$pool = new RollingCurl(array($this, 'handleCurlResponse'));
$pool->window_size = count($subset);
foreach ($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]);
$this->debug("...$url");
if (!$isRedirect && isset($this->requests[$url])) {
$this->debug("......in memory");
/*
} elseif ($this->isCached($url)) {
$this->debug("......is cached");
if (!$this->minimiseMemoryUse) {
$this->requests[$url] = $this->getCached($url);
}
*/
} else {
$this->debug("......adding to pool");
$req_url = $this->rewriteUrls($url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
$_meth = 'HEAD';
} else {
$_meth = 'GET';
unset($this->requests[$orig]['wrongGuess']);
}
$headers = array();
//$headers[] = 'User-Agent: '.$this->userAgent;
$headers[] = $this->getUserAgent($req_url);
// add referer for picky sites
$headers[] = 'Referer: '.$this->referer;
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
$this->debug("......sending cookies: $cookies");
$headers[] = 'Cookie: '.$cookies;
}
$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
CURLOPT_TIMEOUT => $this->requestOptions['timeout']
));
$httpRequest->set_original_url($orig);
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
$pool->add($httpRequest);
}
}
// did we get anything into the pool?
if (count($pool) > 0) {
$this->debug('Sending request...');
$pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
$this->debug('Received responses');
foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
// $this->requests[$orig]['headers']
// $this->requests[$orig]['body']
// $this->requests[$orig]['effective_url']
// check content type
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
$this->requests[$orig]['body'] = '';
$_header_only_type = true;
$this->debug('Header only type returned');
} else {
$_header_only_type = false;
}
$status_code = $this->requests[$orig]['status_code'];
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
}
} elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
// the response content-type did not match our 'header only' types,
// but we'd issues a HEAD request because we assumed it would. So
// let's queue a proper GET request for this item...
$this->debug('Wrong guess at content-type, queing GET request');
$this->requests[$orig]['wrongGuess'] = true;
$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
// check for <meta name='fragment' content='!'/>
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) {
$this->redirectQueue[$orig] = $redirectURL;
}
}
}
// die($url.' -multi- '.$request->getResponseInfo('effective_url'));
unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
}
}
}
}
//////////////////////////////////////////////////////
// sequential (file_get_contents)
else {
$this->debug('Starting sequential fetch (file_get_contents)');
$this->debug('Processing set of '.count($urls));
foreach ($urls as $orig => $url) {
if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]);
$this->debug("...$url");
if (!$isRedirect && isset($this->requests[$url])) {
$this->debug("......in memory");
/*
} elseif ($this->isCached($url)) {
$this->debug("......is cached");
if (!$this->minimiseMemoryUse) {
$this->requests[$url] = $this->getCached($url);
}
*/
} else {
$this->debug("Sending request for $url");
$this->requests[$orig]['original_url'] = $orig;
$req_url = $this->rewriteUrls($url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
// send cookies, if we have any
$httpContext = $this->httpContext;
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
// add referer for picky sites
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
$this->debug("......sending cookies: $cookies");
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
}
if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
$this->debug('Received response');
// get status code
if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
$this->debug('Error: no status code found');
// TODO: handle error - no status code
} else {
$this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
// check content type
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
$this->requests[$orig]['body'] = '';
} else {
$this->requests[$orig]['body'] = $html;
}
$this->requests[$orig]['effective_url'] = $req_url;
$this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
unset($match);
// handle redirect
if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
$this->requests[$orig]['location'] = trim($match[1]);
}
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
}
} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
// check for <meta name='fragment' content='!'/>
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) {
$this->redirectQueue[$orig] = $redirectURL;
}
}
}
}
} else {
$this->debug('Error retrieving URL');
//print_r($req_url);
//print_r($http_response_header);
//print_r($html);
// TODO: handle error - failed to retrieve URL
}
}
}
}
}
public function handleCurlResponse($response, $info, $request) {
$orig = $request->url_original;
$this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
$this->requests[$orig]['body'] = substr($response, $info['header_size']);
$this->requests[$orig]['method'] = $request->method;
$this->requests[$orig]['effective_url'] = $info['url'];
$this->requests[$orig]['status_code'] = (int)$info['http_code'];
if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
$this->requests[$orig]['location'] = trim($match[1]);
}
}
protected function headersToString(array $headers, $associative=true) {
if (!$associative) {
return implode("\n", $headers);
} else {
$str = '';
foreach ($headers as $key => $val) {
if (is_array($val)) {
foreach ($val as $v) $str .= "$key: $v\n";
} else {
$str .= "$key: $val\n";
}
}
return rtrim($str);
}
}
public function get($url, $remove=false, $gzdecode=true) {
$url = "$url";
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
$response = $this->requests[$url];
/*
} elseif ($this->isCached($url)) {
$this->debug("URL already fetched - in disk cache ($url)");
$response = $this->getCached($url);
$this->requests[$url] = $response;
*/
} else {
$this->debug("Fetching URL ($url)");
$this->fetchAll(array($url));
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
$response = $this->requests[$url];
} else {
$this->debug("Request failed");
$response = false;
}
}
/*
if ($this->minimiseMemoryUse && $response) {
$this->cache($url);
unset($this->requests[$url]);
}
*/
if ($remove && $response) unset($this->requests[$url]);
if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {
if ($html = gzdecode($response['body'])) {
$response['body'] = $html;
}
}
return $response;
}
public function parallelSupport() {
return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
}
private function headerOnlyType($headers) {
if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {
// look for full mime type (e.g. image/jpeg) or just type (e.g. image)
$match[1] = strtolower(trim($match[1]));
$match[2] = strtolower(trim($match[2]));
foreach (array($match[1], $match[2]) as $mime) {
if (in_array($mime, $this->headerOnlyTypes)) return true;
}
}
return false;
}
private function possibleUnsupportedType($url) {
$path = @parse_url($url, PHP_URL_PATH);
if ($path && strpos($path, '.') !== false) {
$ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
return in_array($ext, $this->headerOnlyClues);
}
return false;
}
}
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
if (!function_exists('gzdecode')) {
function gzdecode($data,&$filename='',&$error='',$maxlength=null)
{
$len = strlen($data);
if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
$error = "Not in GZIP format.";
return null; // Not GZIP format (See RFC 1952)
}
$method = ord(substr($data,2,1)); // Compression method
$flags = ord(substr($data,3,1)); // Flags
if ($flags & 31 != $flags) {
$error = "Reserved bits not allowed.";
return null;
}
// NOTE: $mtime may be negative (PHP integer limitations)
$mtime = unpack("V", substr($data,4,4));
$mtime = $mtime[1];
$xfl = substr($data,8,1);
$os = substr($data,8,1);
$headerlen = 10;
$extralen = 0;
$extra = "";
if ($flags & 4) {
// 2-byte length prefixed EXTRA data in header
if ($len - $headerlen - 2 < 8) {
return false; // invalid
}
$extralen = unpack("v",substr($data,8,2));
$extralen = $extralen[1];
if ($len - $headerlen - 2 - $extralen < 8) {
return false; // invalid
}
$extra = substr($data,10,$extralen);
$headerlen += 2 + $extralen;
}
$filenamelen = 0;
$filename = "";
if ($flags & 8) {
// C-style string
if ($len - $headerlen - 1 < 8) {
return false; // invalid
}
$filenamelen = strpos(substr($data,$headerlen),chr(0));
if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
return false; // invalid
}
$filename = substr($data,$headerlen,$filenamelen);
$headerlen += $filenamelen + 1;
}
$commentlen = 0;
$comment = "";
if ($flags & 16) {
// C-style string COMMENT data in header
if ($len - $headerlen - 1 < 8) {
return false; // invalid
}
$commentlen = strpos(substr($data,$headerlen),chr(0));
if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
return false; // Invalid header format
}
$comment = substr($data,$headerlen,$commentlen);
$headerlen += $commentlen + 1;
}
$headercrc = "";
if ($flags & 2) {
// 2-bytes (lowest order) of CRC32 on header present
if ($len - $headerlen - 2 < 8) {
return false; // invalid
}
$calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
$headercrc = unpack("v", substr($data,$headerlen,2));
$headercrc = $headercrc[1];
if ($headercrc != $calccrc) {
$error = "Header checksum failed.";
return false; // Bad header CRC
}
$headerlen += 2;
}
// GZIP FOOTER
$datacrc = unpack("V",substr($data,-8,4));
$datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
$isize = unpack("V",substr($data,-4));
$isize = $isize[1];
// decompression:
$bodylen = $len-$headerlen-8;
if ($bodylen < 1) {
// IMPLEMENTATION BUG!
return null;
}
$body = substr($data,$headerlen,$bodylen);
$data = "";
if ($bodylen > 0) {
switch ($method) {
case 8:
// Currently the only supported compression method:
$data = gzinflate($body,$maxlength);
break;
default:
$error = "Unknown compression method.";
return false;
}
} // zero-byte body content is allowed
// Verifiy CRC32
$crc = sprintf("%u",crc32($data));
$crcOK = $crc == $datacrc;
$lenOK = $isize == strlen($data);
if (!$lenOK || !$crcOK) {
$error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
return false;
}
return $data;
}
}

View file

@ -1,402 +0,0 @@
<?php
/*
Authored by Josh Fraser (www.joshfraser.com)
Released under Apache License 2.0
Maintained by Alexander Makarov, http://rmcreative.ru/
Modified by Keyvan Minoukadeh for the Five Filters project: http://fivefilters.org
*/
/**
* Class that represent a single curl request
*/
class RollingCurlRequest {
public $url = false;
public $url_original = false; // used for tracking redirects
public $method = 'GET';
public $post_data = null;
public $headers = null;
public $options = null;
/**
* @param string $url
* @param string $method
* @param $post_data
* @param $headers
* @param $options
* @return void
*/
function __construct($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
$this->url = $url;
$this->url_original = $url;
$this->method = $method;
$this->post_data = $post_data;
$this->headers = $headers;
$this->options = $options;
}
/**
* @param string $url
* @return void
*/
public function set_original_url($url) {
$this->url_original = $url;
}
/**
* @return void
*/
public function __destruct() {
unset($this->url, $this->url_original, $this->method, $this->post_data, $this->headers, $this->options);
}
}
/**
* RollingCurl custom exception
*/
class RollingCurlException extends Exception {
}
/**
* Class that holds a rolling queue of curl requests.
*
* @throws RollingCurlException
*/
class RollingCurl implements Countable {
/**
* @var int
*
* Window size is the max number of simultaneous connections allowed.
*
* REMEMBER TO RESPECT THE SERVERS:
* Sending too many requests at one time can easily be perceived
* as a DOS attack. Increase this window_size if you are making requests
* to multiple servers or have permission from the receving server admins.
*/
private $window_size = 5;
/**
* @var float
*
* Timeout is the timeout used for curl_multi_select.
*/
private $timeout = 10;
/**
* @var string|array
*
* Callback function to be applied to each result.
*/
private $callback;
/**
* @var array
*
* Set your base options that you want to be used with EVERY request.
*/
protected $options = array(
CURLOPT_SSL_VERIFYPEER => 0,
CURLOPT_RETURNTRANSFER => 1,
CURLOPT_CONNECTTIMEOUT => 30,
CURLOPT_TIMEOUT => 30
);
/**
* @var array
*/
private $headers = array();
/**
* @var Request[]
*
* The request queue
*/
private $requests = array();
/**
* @var RequestMap[]
*
* Maps handles to request indexes
*/
private $requestMap = array();
/**
* @param $callback
* Callback function to be applied to each result.
*
* Can be specified as 'my_callback_function'
* or array($object, 'my_callback_method').
*
* Function should take three parameters: $response, $info, $request.
* $response is response body, $info is additional curl info.
* $request is the original request
*
* @return void
*/
function __construct($callback = null) {
$this->callback = $callback;
}
/**
* @param string $name
* @return mixed
*/
public function __get($name) {
return (isset($this->{$name})) ? $this->{$name} : null;
}
/**
* @param string $name
* @param mixed $value
* @return bool
*/
public function __set($name, $value) {
// append the base options & headers
if ($name == "options" || $name == "headers") {
$this->{$name} = $value + $this->{$name};
} else {
$this->{$name} = $value;
}
return true;
}
/**
* Count number of requests added (Countable interface)
*
* @return int
*/
public function count() {
return count($this->requests);
}
/**
* Add a request to the request queue
*
* @param Request $request
* @return bool
*/
public function add($request) {
$this->requests[] = $request;
return true;
}
/**
* Create new Request and add it to the request queue
*
* @param string $url
* @param string $method
* @param $post_data
* @param $headers
* @param $options
* @return bool
*/
public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
$this->requests[] = new RollingCurlRequest($url, $method, $post_data, $headers, $options);
return true;
}
/**
* Perform GET request
*
* @param string $url
* @param $headers
* @param $options
* @return bool
*/
public function get($url, $headers = null, $options = null) {
return $this->request($url, "GET", null, $headers, $options);
}
/**
* Perform POST request
*
* @param string $url
* @param $post_data
* @param $headers
* @param $options
* @return bool
*/
public function post($url, $post_data = null, $headers = null, $options = null) {
return $this->request($url, "POST", $post_data, $headers, $options);
}
/**
* Execute processing
*
* @param int $window_size Max number of simultaneous connections
* @return string|bool
*/
public function execute($window_size = null) {
// rolling curl window must always be greater than 1
if (sizeof($this->requests) == 1) {
return $this->single_curl();
} else {
// start the rolling curl. window_size is the max number of simultaneous connections
return $this->rolling_curl($window_size);
}
}
/**
* Performs a single curl request
*
* @access private
* @return string
*/
private function single_curl() {
$ch = curl_init();
$request = array_shift($this->requests);
$options = $this->get_options($request);
curl_setopt_array($ch, $options);
$output = curl_exec($ch);
$info = curl_getinfo($ch);
// it's not neccesary to set a callback for one-off requests
if ($this->callback) {
$callback = $this->callback;
if (is_callable($this->callback)) {
call_user_func($callback, $output, $info, $request);
}
}
else
return $output;
return true;
}
/**
* Performs multiple curl requests
*
* @access private
* @throws RollingCurlException
* @param int $window_size Max number of simultaneous connections
* @return bool
*/
private function rolling_curl($window_size = null) {
if ($window_size)
$this->window_size = $window_size;
// make sure the rolling window isn't greater than the # of urls
if (sizeof($this->requests) < $this->window_size)
$this->window_size = sizeof($this->requests);
if ($this->window_size < 2) {
throw new RollingCurlException("Window size must be greater than 1");
}
$master = curl_multi_init();
// start the first batch of requests
for ($i = 0; $i < $this->window_size; $i++) {
$ch = curl_init();
$options = $this->get_options($this->requests[$i]);
curl_setopt_array($ch, $options);
curl_multi_add_handle($master, $ch);
// Add to our request Maps
$key = (string) $ch;
$this->requestMap[$key] = $i;
}
do {
while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ;
if ($execrun != CURLM_OK)
break;
// a request was just completed -- find out which one
while ($done = curl_multi_info_read($master)) {
// get the info and content returned on the request
$info = curl_getinfo($done['handle']);
$output = curl_multi_getcontent($done['handle']);
// send the return values to the callback function.
$callback = $this->callback;
if (is_callable($callback)) {
$key = (string) $done['handle'];
$request = $this->requests[$this->requestMap[$key]];
unset($this->requestMap[$key]);
call_user_func($callback, $output, $info, $request);
}
// start a new request (it's important to do this before removing the old one)
if ($i < sizeof($this->requests) && isset($this->requests[$i]) && $i < count($this->requests)) {
$ch = curl_init();
$options = $this->get_options($this->requests[$i]);
curl_setopt_array($ch, $options);
curl_multi_add_handle($master, $ch);
// Add to our request Maps
$key = (string) $ch;
$this->requestMap[$key] = $i;
$i++;
}
// remove the curl handle that just completed
curl_multi_remove_handle($master, $done['handle']);
}
// Block for data in / output; error handling is done by curl_multi_exec
//if ($running) curl_multi_select($master, $this->timeout);
// removing timeout as it causes problems on Windows with PHP 5.3.5 and Curl 7.20.0
if ($running) curl_multi_select($master);
} while ($running);
curl_multi_close($master);
return true;
}
/**
* Helper function to set up a new request by setting the appropriate options
*
* @access private
* @param Request $request
* @return array
*/
private function get_options($request) {
// options for this entire curl object
$options = $this->__get('options');
// We're managing reirects in PHP - allows us to intervene and rewrite/block URLs
// before the next request goes out.
$options[CURLOPT_FOLLOWLOCATION] = 0;
$options[CURLOPT_MAXREDIRS] = 0;
//if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode')) {
// $options[CURLOPT_FOLLOWLOCATION] = 1;
// $options[CURLOPT_MAXREDIRS] = 5;
//}
$headers = $this->__get('headers');
// append custom headers for this specific request
if ($request->headers) {
$headers = $headers + $request->headers;
}
// append custom options for this specific request
if ($request->options) {
$options = $request->options + $options;
}
// set the request URL
$options[CURLOPT_URL] = $request->url;
if ($headers) {
$options[CURLOPT_HTTPHEADER] = $headers;
}
// return response headers
$options[CURLOPT_HEADER] = 1;
// send HEAD request?
if ($request->method == 'HEAD') {
$options[CURLOPT_NOBODY] = 1;
}
return $options;
}
/**
* @return void
*/
public function __destruct() {
unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
}
}

View file

@ -1,78 +0,0 @@
<?php
/**
* Humble HTTP Agent extension for SimplePie_File
*
* This class is designed to extend and override SimplePie_File
* in order to prevent duplicate HTTP requests being sent out.
* The idea is to initialise an instance of Humble HTTP Agent
* and attach it, to a static class variable, of this class.
* SimplePie will then automatically initialise this class
*
* @date 2011-02-28
*/
class SimplePie_HumbleHttpAgent extends SimplePie_File
{
protected static $agent;
var $url;
var $useragent;
var $success = true;
var $headers = array();
var $body;
var $status_code;
var $redirects = 0;
var $error;
var $method = SIMPLEPIE_FILE_SOURCE_NONE;
public static function set_agent(HumbleHttpAgent $agent) {
self::$agent = $agent;
}
public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
if (class_exists('idna_convert'))
{
$idn = new idna_convert();
$parsed = SimplePie_Misc::parse_url($url);
$url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
}
$this->url = $url;
$this->useragent = $useragent;
if (preg_match('/^http(s)?:\/\//i', $url))
{
if (!is_array($headers))
{
$headers = array();
}
$this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
$headers2 = array();
foreach ($headers as $key => $value) {
$headers2[] = "$key: $value";
}
//TODO: allow for HTTP headers
// curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);
$response = self::$agent->get($url);
if ($response === false || !isset($response['status_code'])) {
$this->error = 'failed to fetch URL';
$this->success = false;
} else {
// The extra lines at the end are there to satisfy SimplePie's HTTP parser.
// The class expects a full HTTP message, whereas we're giving it only
// headers - the new lines indicate the start of the body.
$parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
if ($parser->parse()) {
$this->headers = $parser->headers;
//$this->body = $parser->body;
$this->body = $response['body'];
$this->status_code = $parser->status_code;
}
}
}
else
{
$this->error = 'invalid URL';
$this->success = false;
}
}
}

File diff suppressed because it is too large Load diff

View file

@ -1,57 +0,0 @@
<?php
class Text_LanguageDetect_Exception extends Exception
{
/**
* Database file could not be found
*/
const DB_NOT_FOUND = 10;
/**
* Database file found, but not readable
*/
const DB_NOT_READABLE = 11;
/**
* Database file is empty
*/
const DB_EMPTY = 12;
/**
* Database contents is not a PHP array
*/
const DB_NOT_ARRAY = 13;
/**
* Magic quotes are activated
*/
const MAGIC_QUOTES = 14;
/**
* Parameter of invalid type passed to method
*/
const PARAM_TYPE = 20;
/**
* Character in parameter is invalid
*/
const INVALID_CHAR = 21;
/**
* Language is not in the database
*/
const UNKNOWN_LANGUAGE = 30;
/**
* Error during block detection
*/
const BLOCK_DETECTION = 40;
/**
* Error while clustering languages
*/
const NO_HIGHEST_KEY = 50;
}

View file

@ -1,339 +0,0 @@
<?php
/**
* Part of Text_LanguageDetect
*
* PHP version 5
*
* @category Text
* @package Text_LanguageDetect
* @author Christian Weiske <cweiske@php.net>
* @copyright 2011 Christian Weiske <cweiske@php.net>
* @license http://www.debian.org/misc/bsd.license BSD
* @version SVN: $Id$
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
/**
* Provides a mapping between the languages from lang.dat and the
* ISO 639-1 and ISO-639-2 codes.
*
* Note that this class contains only languages that exist in lang.dat.
*
* @category Text
* @package Text_LanguageDetect
* @author Christian Weiske <cweiske@php.net>
* @copyright 2011 Christian Weiske <cweiske@php.net>
* @license http://www.debian.org/misc/bsd.license BSD
* @link http://www.loc.gov/standards/iso639-2/php/code_list.php
*/
class Text_LanguageDetect_ISO639
{
/**
* Maps all language names from the language database to the
* ISO 639-1 2-letter language code.
*
* NULL indicates that there is no 2-letter code.
*
* @var array
*/
public static $nameToCode2 = array(
'albanian' => 'sq',
'arabic' => 'ar',
'azeri' => 'az',
'bengali' => 'bn',
'bulgarian' => 'bg',
'cebuano' => null,
'croatian' => 'hr',
'czech' => 'cs',
'danish' => 'da',
'dutch' => 'nl',
'english' => 'en',
'estonian' => 'et',
'farsi' => 'fa',
'finnish' => 'fi',
'french' => 'fr',
'german' => 'de',
'hausa' => 'ha',
'hawaiian' => null,
'hindi' => 'hi',
'hungarian' => 'hu',
'icelandic' => 'is',
'indonesian' => 'id',
'italian' => 'it',
'kazakh' => 'kk',
'kyrgyz' => 'ky',
'latin' => 'la',
'latvian' => 'lv',
'lithuanian' => 'lt',
'macedonian' => 'mk',
'mongolian' => 'mn',
'nepali' => 'ne',
'norwegian' => 'no',
'pashto' => 'ps',
'pidgin' => null,
'polish' => 'pl',
'portuguese' => 'pt',
'romanian' => 'ro',
'russian' => 'ru',
'serbian' => 'sr',
'slovak' => 'sk',
'slovene' => 'sl',
'somali' => 'so',
'spanish' => 'es',
'swahili' => 'sw',
'swedish' => 'sv',
'tagalog' => 'tl',
'turkish' => 'tr',
'ukrainian' => 'uk',
'urdu' => 'ur',
'uzbek' => 'uz',
'vietnamese' => 'vi',
'welsh' => 'cy',
);
/**
* Maps all language names from the language database to the
* ISO 639-2 3-letter language code.
*
* @var array
*/
public static $nameToCode3 = array(
'albanian' => 'sqi',
'arabic' => 'ara',
'azeri' => 'aze',
'bengali' => 'ben',
'bulgarian' => 'bul',
'cebuano' => 'ceb',
'croatian' => 'hrv',
'czech' => 'ces',
'danish' => 'dan',
'dutch' => 'nld',
'english' => 'eng',
'estonian' => 'est',
'farsi' => 'fas',
'finnish' => 'fin',
'french' => 'fra',
'german' => 'deu',
'hausa' => 'hau',
'hawaiian' => 'haw',
'hindi' => 'hin',
'hungarian' => 'hun',
'icelandic' => 'isl',
'indonesian' => 'ind',
'italian' => 'ita',
'kazakh' => 'kaz',
'kyrgyz' => 'kir',
'latin' => 'lat',
'latvian' => 'lav',
'lithuanian' => 'lit',
'macedonian' => 'mkd',
'mongolian' => 'mon',
'nepali' => 'nep',
'norwegian' => 'nor',
'pashto' => 'pus',
'pidgin' => 'crp',
'polish' => 'pol',
'portuguese' => 'por',
'romanian' => 'ron',
'russian' => 'rus',
'serbian' => 'srp',
'slovak' => 'slk',
'slovene' => 'slv',
'somali' => 'som',
'spanish' => 'spa',
'swahili' => 'swa',
'swedish' => 'swe',
'tagalog' => 'tgl',
'turkish' => 'tur',
'ukrainian' => 'ukr',
'urdu' => 'urd',
'uzbek' => 'uzb',
'vietnamese' => 'vie',
'welsh' => 'cym',
);
/**
* Maps ISO 639-1 2-letter language codes to the language names
* in the language database
*
* Not all languages have a 2 letter code, so some are missing
*
* @var array
*/
public static $code2ToName = array(
'ar' => 'arabic',
'az' => 'azeri',
'bg' => 'bulgarian',
'bn' => 'bengali',
'cs' => 'czech',
'cy' => 'welsh',
'da' => 'danish',
'de' => 'german',
'en' => 'english',
'es' => 'spanish',
'et' => 'estonian',
'fa' => 'farsi',
'fi' => 'finnish',
'fr' => 'french',
'ha' => 'hausa',
'hi' => 'hindi',
'hr' => 'croatian',
'hu' => 'hungarian',
'id' => 'indonesian',
'is' => 'icelandic',
'it' => 'italian',
'kk' => 'kazakh',
'ky' => 'kyrgyz',
'la' => 'latin',
'lt' => 'lithuanian',
'lv' => 'latvian',
'mk' => 'macedonian',
'mn' => 'mongolian',
'ne' => 'nepali',
'nl' => 'dutch',
'no' => 'norwegian',
'pl' => 'polish',
'ps' => 'pashto',
'pt' => 'portuguese',
'ro' => 'romanian',
'ru' => 'russian',
'sk' => 'slovak',
'sl' => 'slovene',
'so' => 'somali',
'sq' => 'albanian',
'sr' => 'serbian',
'sv' => 'swedish',
'sw' => 'swahili',
'tl' => 'tagalog',
'tr' => 'turkish',
'uk' => 'ukrainian',
'ur' => 'urdu',
'uz' => 'uzbek',
'vi' => 'vietnamese',
);
/**
* Maps ISO 639-2 3-letter language codes to the language names
* in the language database.
*
* @var array
*/
public static $code3ToName = array(
'ara' => 'arabic',
'aze' => 'azeri',
'ben' => 'bengali',
'bul' => 'bulgarian',
'ceb' => 'cebuano',
'ces' => 'czech',
'crp' => 'pidgin',
'cym' => 'welsh',
'dan' => 'danish',
'deu' => 'german',
'eng' => 'english',
'est' => 'estonian',
'fas' => 'farsi',
'fin' => 'finnish',
'fra' => 'french',
'hau' => 'hausa',
'haw' => 'hawaiian',
'hin' => 'hindi',
'hrv' => 'croatian',
'hun' => 'hungarian',
'ind' => 'indonesian',
'isl' => 'icelandic',
'ita' => 'italian',
'kaz' => 'kazakh',
'kir' => 'kyrgyz',
'lat' => 'latin',
'lav' => 'latvian',
'lit' => 'lithuanian',
'mkd' => 'macedonian',
'mon' => 'mongolian',
'nep' => 'nepali',
'nld' => 'dutch',
'nor' => 'norwegian',
'pol' => 'polish',
'por' => 'portuguese',
'pus' => 'pashto',
'rom' => 'romanian',
'rus' => 'russian',
'slk' => 'slovak',
'slv' => 'slovene',
'som' => 'somali',
'spa' => 'spanish',
'sqi' => 'albanian',
'srp' => 'serbian',
'swa' => 'swahili',
'swe' => 'swedish',
'tgl' => 'tagalog',
'tur' => 'turkish',
'ukr' => 'ukrainian',
'urd' => 'urdu',
'uzb' => 'uzbek',
'vie' => 'vietnamese',
);
/**
* Returns the 2-letter ISO 639-1 code for the given language name.
*
* @param string $lang English language name like "swedish"
*
* @return string Two-letter language code (e.g. "sv") or NULL if not found
*/
public static function nameToCode2($lang)
{
$lang = strtolower($lang);
if (!isset(self::$nameToCode2[$lang])) {
return null;
}
return self::$nameToCode2[$lang];
}
/**
* Returns the 3-letter ISO 639-2 code for the given language name.
*
* @param string $lang English language name like "swedish"
*
* @return string Three-letter language code (e.g. "swe") or NULL if not found
*/
public static function nameToCode3($lang)
{
$lang = strtolower($lang);
if (!isset(self::$nameToCode3[$lang])) {
return null;
}
return self::$nameToCode3[$lang];
}
/**
* Returns the language name for the given 2-letter ISO 639-1 code.
*
* @param string $code Two-letter language code (e.g. "sv")
*
* @return string English language name like "swedish"
*/
public static function code2ToName($code)
{
$lang = strtolower($code);
if (!isset(self::$code2ToName[$code])) {
return null;
}
return self::$code2ToName[$code];
}
/**
* Returns the language name for the given 3-letter ISO 639-2 code.
*
* @param string $code Three-letter language code (e.g. "swe")
*
* @return string English language name like "swedish"
*/
public static function code3ToName($code)
{
$lang = strtolower($code);
if (!isset(self::$code3ToName[$code])) {
return null;
}
return self::$code3ToName[$code];
}
}

View file

@ -1,347 +0,0 @@
<?php
/**
* This class represents a text sample to be parsed.
*
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro
* @copyright 2006
* @license BSD
* @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
* @link http://pear.php.net/package/Text_LanguageDetect/
* @link http://langdetect.blogspot.com/
*/
/**
* This class represents a text sample to be parsed.
*
* This separates the analysis of a text sample from the primary LanguageDetect
* class. After a new profile has been built, the data can be retrieved using
* the accessor functions.
*
* This class is intended to be used by the Text_LanguageDetect class, not
* end-users.
*
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro
* @copyright 2006
* @license BSD
* @version release: 0.3.0
*/
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
/**
* the piece of text being parsed
*
* @access private
* @var string
*/
var $_string;
/**
* stores the trigram frequencies of the sample
*
* @access private
* @var string
*/
var $_trigrams = array();
/**
* stores the trigram ranks of the sample
*
* @access private
* @var array
*/
var $_trigram_ranks = array();
/**
* stores the unicode blocks of the sample
*
* @access private
* @var array
*/
var $_unicode_blocks = array();
/**
* Whether the parser should compile the unicode ranges
*
* @access private
* @var bool
*/
var $_compile_unicode = false;
/**
* Whether the parser should compile trigrams
*
* @access private
* @var bool
*/
var $_compile_trigram = false;
/**
* Whether the trigram parser should pad the beginning of the string
*
* @access private
* @var bool
*/
var $_trigram_pad_start = false;
/**
* Whether the unicode parser should skip non-alphabetical ascii chars
*
* @access private
* @var bool
*/
var $_unicode_skip_symbols = true;
/**
* Constructor
*
* @access private
* @param string $string string to be parsed
*/
function Text_LanguageDetect_Parser($string) {
$this->_string = $string;
}
/**
* Returns true if a string is suitable for parsing
*
* @param string $str input string to test
* @return bool true if acceptable, false if not
*/
public static function validateString($str) {
if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
return true;
} else {
return false;
}
}
/**
* turn on/off trigram counting
*
* @access public
* @param bool $bool true for on, false for off
*/
function prepareTrigram($bool = true)
{
$this->_compile_trigram = $bool;
}
/**
* turn on/off unicode block counting
*
* @access public
* @param bool $bool true for on, false for off
*/
function prepareUnicode($bool = true)
{
$this->_compile_unicode = $bool;
}
/**
* turn on/off padding the beginning of the sample string
*
* @access public
* @param bool $bool true for on, false for off
*/
function setPadStart($bool = true)
{
$this->_trigram_pad_start = $bool;
}
/**
* Should the unicode block counter skip non-alphabetical ascii chars?
*
* @access public
* @param bool $bool true for on, false for off
*/
function setUnicodeSkipSymbols($bool = true)
{
$this->_unicode_skip_symbols = $bool;
}
/**
* Returns the trigram ranks for the text sample
*
* @access public
* @return array trigram ranks in the text sample
*/
function &getTrigramRanks()
{
return $this->_trigram_ranks;
}
/**
* Return the trigram freqency table
*
* only used in testing to make sure the parser is working
*
* @access public
* @return array trigram freqencies in the text sample
*/
function &getTrigramFreqs()
{
return $this->_trigram;
}
/**
* returns the array of unicode blocks
*
* @access public
* @return array unicode blocks in the text sample
*/
function &getUnicodeBlocks()
{
return $this->_unicode_blocks;
}
/**
* Executes the parsing operation
*
* Be sure to call the set*() functions to set options and the
* prepare*() functions first to tell it what kind of data to compute
*
* Afterwards the get*() functions can be used to access the compiled
* information.
*
* @access public
*/
function analyze()
{
$len = strlen($this->_string);
$byte_counter = 0;
// unicode startup
if ($this->_compile_unicode) {
$blocks = $this->_read_unicode_block_db();
$block_count = count($blocks);
$skipped_count = 0;
$unicode_chars = array();
}
// trigram startup
if ($this->_compile_trigram) {
// initialize them as blank so the parser will skip the first two
// (since it skips trigrams with more than 2 contiguous spaces)
$a = ' ';
$b = ' ';
// kludge
// if it finds a valid trigram to start and the start pad option is
// off, then set a variable that will be used to reduce this
// trigram after parsing has finished
if (!$this->_trigram_pad_start) {
$a = $this->_next_char($this->_string, $byte_counter, true);
if ($a != ' ') {
$b = $this->_next_char($this->_string, $byte_counter, true);
$dropone = " $a$b";
}
$byte_counter = 0;
$a = ' ';
$b = ' ';
}
}
while ($byte_counter < $len) {
$char = $this->_next_char($this->_string, $byte_counter, true);
// language trigram detection
if ($this->_compile_trigram) {
if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
if (!isset($this->_trigram[$a . $b . $char])) {
$this->_trigram[$a . $b . $char] = 1;
} else {
$this->_trigram[$a . $b . $char]++;
}
}
$a = $b;
$b = $char;
}
// unicode block detection
if ($this->_compile_unicode) {
if ($this->_unicode_skip_symbols
&& strlen($char) == 1
&& ($char < 'A' || $char > 'z'
|| ($char > 'Z' && $char < 'a'))
&& $char != "'") { // does not skip the apostrophe
// since it's included in the language
// models
$skipped_count++;
continue;
}
// build an array of all the characters
if (isset($unicode_chars[$char])) {
$unicode_chars[$char]++;
} else {
$unicode_chars[$char] = 1;
}
}
// todo: add byte detection here
}
// unicode cleanup
if ($this->_compile_unicode) {
foreach ($unicode_chars as $utf8_char => $count) {
$search_result = $this->_unicode_block_name(
$this->_utf8char2unicode($utf8_char), $blocks, $block_count);
if ($search_result != -1) {
$block_name = $search_result[2];
} else {
$block_name = '[Malformatted]';
}
if (isset($this->_unicode_blocks[$block_name])) {
$this->_unicode_blocks[$block_name] += $count;
} else {
$this->_unicode_blocks[$block_name] = $count;
}
}
}
// trigram cleanup
if ($this->_compile_trigram) {
// pad the end
if ($b != ' ') {
if (!isset($this->_trigram["$a$b "])) {
$this->_trigram["$a$b "] = 1;
} else {
$this->_trigram["$a$b "]++;
}
}
// perl compatibility; Language::Guess does not pad the beginning
// kludge
if (isset($dropone)) {
if ($this->_trigram[$dropone] == 1) {
unset($this->_trigram[$dropone]);
} else {
$this->_trigram[$dropone]--;
}
}
if (!empty($this->_trigram)) {
$this->_trigram_ranks = $this->_arr_rank($this->_trigram);
} else {
$this->_trigram_ranks = array();
}
}
}
}
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1,912 +0,0 @@
<?php
// Full-Text RSS: Create Full-Text Feeds
// Author: Keyvan Minoukadeh
// Copyright (c) 2013 Keyvan Minoukadeh
// License: AGPLv3
// Version: 3.2
// Date: 2013-05-13
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Usage
// -----
// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
//error_reporting(E_ALL ^ E_NOTICE);
ini_set("display_errors", 1);
@set_time_limit(120);
libxml_use_internal_errors(true);
// Deal with magic quotes
if (get_magic_quotes_gpc()) {
$process = array(&$_GET, &$_POST, &$_REQUEST);
while (list($key, $val) = each($process)) {
foreach ($val as $k => $v) {
unset($process[$key][$k]);
if (is_array($v)) {
$process[$key][stripslashes($k)] = $v;
$process[] = &$process[$key][stripslashes($k)];
} else {
$process[$key][stripslashes($k)] = stripslashes($v);
}
}
}
unset($process);
}
// set include path
set_include_path(realpath(dirname(__FILE__).'/libraries').PATH_SEPARATOR.get_include_path());
require_once dirname(__FILE__).'/makefulltextfeedHelpers.php';
////////////////////////////////
// Load config file
////////////////////////////////
require dirname(__FILE__).'/config.php';
////////////////////////////////
// Prevent indexing/following by search engines because:
// 1. The content is already public and presumably indexed (why create duplicates?)
// 2. Not doing so might increase number of requests from search engines, thus increasing server load
// Note: feed readers and services such as Yahoo Pipes will not be affected by this header.
// Note: Using Disallow in a robots.txt file will be more effective (search engines will check
// that before even requesting makefulltextfeed.php).
////////////////////////////////
header('X-Robots-Tag: noindex, nofollow');
////////////////////////////////
// Check if service is enabled
////////////////////////////////
if (!$options->enabled) {
die('The full-text RSS service is currently disabled');
}
////////////////////////////////
// Debug mode?
// See the config file for debug options.
////////////////////////////////
$debug_mode = false;
if (isset($_GET['debug'])) {
if ($options->debug === true || $options->debug == 'user') {
$debug_mode = true;
} elseif ($options->debug == 'admin') {
session_start();
$debug_mode = (@$_SESSION['auth'] == 1);
}
if ($debug_mode) {
header('Content-Type: text/plain; charset=utf-8');
} else {
if ($options->debug == 'admin') {
die('You must be logged in to the <a href="admin/">admin area</a> to see debug output.');
} else {
die('Debugging is disabled.');
}
}
}
////////////////////////////////
// Check for APC
////////////////////////////////
$options->apc = $options->apc && function_exists('apc_add');
if ($options->apc) {
debug('APC is enabled and available on server');
} else {
debug('APC is disabled or not available on server');
}
////////////////////////////////
// Check for smart cache
////////////////////////////////
$options->smart_cache = $options->smart_cache && function_exists('apc_inc');
////////////////////////////////
// Check for feed URL
////////////////////////////////
if (!isset($_GET['url'])) {
die('No URL supplied');
}
$url = trim($_GET['url']);
if (strtolower(substr($url, 0, 7)) == 'feed://') {
$url = 'http://'.substr($url, 7);
}
if (!preg_match('!^https?://.+!i', $url)) {
$url = 'http://'.$url;
}
$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
if ($test === false) {
$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
}
if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
// all okay
unset($test);
} else {
die('Invalid URL supplied');
}
debug("Supplied URL: $url");
/////////////////////////////////
// Redirect to hide API key
/////////////////////////////////
if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->api_keys)) !== false) {
$host = $_SERVER['HTTP_HOST'];
$path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\');
$_qs_url = (strtolower(substr($url, 0, 7)) == 'http://') ? substr($url, 7) : $url;
$redirect = 'http://'.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($_qs_url);
$redirect .= '&key='.$key_index;
$redirect .= '&hash='.urlencode(sha1($_GET['key'].$url));
if (isset($_GET['html'])) $redirect .= '&html='.urlencode($_GET['html']);
if (isset($_GET['max'])) $redirect .= '&max='.(int)$_GET['max'];
if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);
if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
if (isset($_GET['xss'])) $redirect .= '&xss';
if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
if (isset($_GET['debug'])) $redirect .= '&debug';
if ($debug_mode) {
debug('Redirecting to hide access key, follow URL below to continue');
debug("Location: $redirect");
} else {
header("Location: $redirect");
}
exit;
}
///////////////////////////////////////////////
// Set timezone.
// Prevents warnings, but needs more testing -
// perhaps if timezone is set in php.ini we
// don't need to set it at all...
///////////////////////////////////////////////
if (!ini_get('date.timezone') || !@date_default_timezone_set(ini_get('date.timezone'))) {
date_default_timezone_set('UTC');
}
///////////////////////////////////////////////
// Check if the request is explicitly for an HTML page
///////////////////////////////////////////////
$html_only = (isset($_GET['html']) && ($_GET['html'] == '1' || $_GET['html'] == 'true'));
///////////////////////////////////////////////
// Check if valid key supplied
///////////////////////////////////////////////
$valid_key = false;
if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int)$_GET['key']])) {
$valid_key = ($_GET['hash'] == sha1($options->api_keys[(int)$_GET['key']].$url));
}
$key_index = ($valid_key) ? (int)$_GET['key'] : 0;
if (!$valid_key && $options->key_required) {
die('A valid key must be supplied');
}
if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
die('The entered key is invalid');
}
if (file_exists('custom_init.php')) require 'custom_init.php';
///////////////////////////////////////////////
// Check URL against list of blacklisted URLs
///////////////////////////////////////////////
if (!url_allowed($url)) die('URL blocked');
///////////////////////////////////////////////
// Max entries
// see config.php to find these values
///////////////////////////////////////////////
if (isset($_GET['max'])) {
$max = (int)$_GET['max'];
if ($valid_key) {
$max = min($max, $options->max_entries_with_key);
} else {
$max = min($max, $options->max_entries);
}
} else {
if ($valid_key) {
$max = $options->default_entries_with_key;
} else {
$max = $options->default_entries;
}
}
///////////////////////////////////////////////
// Link handling
///////////////////////////////////////////////
if (isset($_GET['links']) && in_array($_GET['links'], array('preserve', 'footnotes', 'remove'))) {
$links = $_GET['links'];
} else {
$links = 'preserve';
}
///////////////////////////////////////////////
// Favour item titles in feed?
///////////////////////////////////////////////
$favour_feed_titles = true;
if ($options->favour_feed_titles == 'user') {
$favour_feed_titles = !isset($_GET['use_extracted_title']);
} else {
$favour_feed_titles = $options->favour_feed_titles;
}
///////////////////////////////////////////////
// Include full content in output?
///////////////////////////////////////////////
if ($options->content === 'user') {
if (isset($_GET['content']) && $_GET['content'] === '0') {
$options->content = false;
} else {
$options->content = true;
}
}
///////////////////////////////////////////////
// Include summaries in output?
///////////////////////////////////////////////
if ($options->summary === 'user') {
if (isset($_GET['summary']) && $_GET['summary'] === '1') {
$options->summary = true;
} else {
$options->summary = false;
}
}
///////////////////////////////////////////////
// Exclude items if extraction fails
///////////////////////////////////////////////
if ($options->exclude_items_on_fail === 'user') {
$exclude_on_fail = (isset($_GET['exc']) && ($_GET['exc'] == '1'));
} else {
$exclude_on_fail = $options->exclude_items_on_fail;
}
///////////////////////////////////////////////
// Detect language
///////////////////////////////////////////////
if ($options->detect_language === 'user') {
if (isset($_GET['l'])) {
$detect_language = (int)$_GET['l'];
} else {
$detect_language = 1;
}
} else {
$detect_language = $options->detect_language;
}
$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
/////////////////////////////////////
// Check for valid format
// (stick to RSS (or RSS as JSON) for the time being)
/////////////////////////////////////
if (isset($_GET['format']) && $_GET['format'] == 'json') {
$format = 'json';
} else {
$format = 'rss';
}
/////////////////////////////////////
// Should we do XSS filtering?
/////////////////////////////////////
if ($options->xss_filter === 'user') {
$xss_filter = isset($_GET['xss']);
} else {
$xss_filter = $options->xss_filter;
}
if (!$xss_filter && isset($_GET['xss'])) {
die('XSS filtering is disabled in config');
}
/////////////////////////////////////
// Check for JSONP
// Regex from https://gist.github.com/1217080
/////////////////////////////////////
$callback = null;
if ($format =='json' && isset($_GET['callback'])) {
$callback = trim($_GET['callback']);
foreach (explode('.', $callback) as $_identifier) {
if (!preg_match('/^[a-zA-Z_$][0-9a-zA-Z_$]*(?:\[(?:".+"|\'.+\'|\d+)\])*?$/', $_identifier)) {
die('Invalid JSONP callback');
}
}
debug("JSONP callback: $callback");
}
//////////////////////////////////
// Enable Cross-Origin Resource Sharing (CORS)
//////////////////////////////////
if ($options->cors) header('Access-Control-Allow-Origin: *');
//////////////////////////////////
// Check for cached copy
//////////////////////////////////
if ($options->caching) {
debug('Caching is enabled...');
$cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
$check_cache = true;
if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, 10*60);
$apc_cache_hits = (int)apc_fetch("cache.$cache_id");
$check_cache = ($apc_cache_hits >= 2);
apc_inc("cache.$cache_id");
if ($check_cache) {
debug('Cache key found in APC, we\'ll try to load cache file from disk');
} else {
debug('Cache key not found in APC');
}
}
if ($check_cache) {
$cache = get_cache();
if ($data = $cache->load($cache_id)) {
if ($debug_mode) {
debug('Loaded cached copy');
exit;
}
if ($format == 'json') {
if ($callback === null) {
header('Content-type: application/json; charset=UTF-8');
} else {
header('Content-type: application/javascript; charset=UTF-8');
}
} else {
header('Content-type: text/xml; charset=UTF-8');
header('X-content-type-options: nosniff');
}
if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
if ($callback) {
echo "$callback($data);";
} else {
echo $data;
}
exit;
}
}
}
//////////////////////////////////
// Set Expires header
//////////////////////////////////
if (!$debug_mode) {
header('Expires: ' . gmdate('D, d M Y H:i:s', time()+(60*10)) . ' GMT');
}
//////////////////////////////////
// Set up HTTP agent
//////////////////////////////////
global $http;
$http = new HumbleHttpAgent();
$http->debug = $debug_mode;
$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url;
//////////////////////////////////
// Set up Content Extractor
//////////////////////////////////
global $extractor;
$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', dirname(__FILE__).'/site_config/standard');
$extractor->debug = $debug_mode;
SiteConfig::$debug = $debug_mode;
SiteConfig::use_apc($options->apc);
$extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers;
////////////////////////////////
// Get RSS/Atom feed
////////////////////////////////
if (!$html_only) {
debug('--------');
debug("Attempting to process URL as feed");
// Send user agent header showing PHP (prevents a HTML response from feedburner)
$http->userAgentDefault = HumbleHttpAgent::UA_PHP;
// configure SimplePie HTTP extension class to use our HumbleHttpAgent instance
SimplePie_HumbleHttpAgent::set_agent($http);
$feed = new SimplePie();
// some feeds use the text/html content type - force_feed tells SimplePie to process anyway
$feed->force_feed(true);
$feed->set_file_class('SimplePie_HumbleHttpAgent');
//$feed->set_feed_url($url); // colons appearing in the URL's path get encoded
$feed->feed_url = $url;
$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
$feed->set_timeout(20);
$feed->enable_cache(false);
$feed->set_stupidly_fast(true);
$feed->enable_order_by_date(false); // we don't want to do anything to the feed
$feed->set_url_replacements(array());
// initialise the feed
// the @ suppresses notices which on some servers causes a 500 internal server error
$result = @$feed->init();
//$feed->handle_content_type();
//$feed->get_title();
if ($result && (!is_array($feed->data) || count($feed->data) == 0)) {
die('Sorry, no feed items found');
}
// from now on, we'll identify ourselves as a browser
$http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
}
////////////////////////////////////////////////////////////////////////////////
// Our given URL is not a feed, so let's create our own feed with a single item:
// the given URL. This basically treats all non-feed URLs as if they were
// single-item feeds.
////////////////////////////////////////////////////////////////////////////////
$isDummyFeed = false;
if ($html_only || !$result) {
debug('--------');
debug("Constructing a single-item feed from URL");
$isDummyFeed = true;
unset($feed, $result);
// create single item dummy feed object
$feed = new DummySingleItemFeed($url);
}
////////////////////////////////////////////
// Create full-text feed
////////////////////////////////////////////
$output = new FeedWriter();
$output->setTitle(strip_tags($feed->get_title()));
$output->setDescription(strip_tags($feed->get_description()));
$output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$output->addHub('http://fivefilters.superfeedr.com/');
$output->addHub('http://pubsubhubbub.appspot.com/');
$output->setSelf('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']);
}
$output->setLink($feed->get_link()); // Google Reader uses this for pulling in favicons
if ($img_url = $feed->get_image_url()) {
$output->setImage($feed->get_title(), $feed->get_link(), $img_url);
}
////////////////////////////////////////////
// Loop through feed items
////////////////////////////////////////////
$items = $feed->get_items(0, $max);
// Request all feed items in parallel (if supported)
$urls_sanitized = array();
$urls = array();
foreach ($items as $key => $item) {
$permalink = htmlspecialchars_decode($item->get_permalink());
// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
$permalink = str_replace('%3A', ':', $permalink);
// validateUrl() strips non-ascii characters
// simplepie already sanitizes URLs so let's not do it again here.
//$permalink = $http->validateUrl($permalink);
if ($permalink) {
$urls_sanitized[] = $permalink;
}
$urls[$key] = $permalink;
}
debug('--------');
debug('Fetching feed items');
$http->fetchAll($urls_sanitized);
//$http->cacheAll();
// count number of items added to full feed
$item_count = 0;
foreach ($items as $key => $item) {
debug('--------');
debug('Processing feed item '.($item_count+1));
$do_content_extraction = true;
$extract_result = false;
$text_sample = null;
$permalink = $urls[$key];
debug("Item URL: $permalink");
$extracted_title = '';
$feed_item_title = $item->get_title();
if ($feed_item_title !== null) {
$feed_item_title = strip_tags(htmlspecialchars_decode($feed_item_title));
}
$newitem = $output->createNewItem();
$newitem->setTitle($feed_item_title);
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
if ($permalink !== false) {
$newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($permalink));
} else {
$newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()));
}
} else {
if ($permalink !== false) {
$newitem->setLink($permalink);
} else {
$newitem->setLink($item->get_permalink());
}
}
//if ($permalink && ($response = $http->get($permalink, true)) && $response['status_code'] < 300) {
// Allowing error codes - some sites return correct content with error status
// e.g. prospectmagazine.co.uk returns 403
if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) continue;
// check if action defined for returned Content-Type
$mime_info = get_mime_action_info($response['headers']);
if (isset($mime_info['action'])) {
if ($mime_info['action'] == 'exclude') {
continue; // skip this feed item entry
} elseif ($mime_info['action'] == 'link') {
if ($mime_info['type'] == 'image') {
$html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
} else {
$html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
}
$extracted_title = $mime_info['name'];
$do_content_extraction = false;
}
}
if ($do_content_extraction) {
$html = $response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
// check site config for single page URL - fetch it if found
$is_single_page = false;
if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
$is_single_page = true;
$effective_url = $single_page_response['effective_url'];
// check if action defined for returned Content-Type
$mime_info = get_mime_action_info($single_page_response['headers']);
if (isset($mime_info['action'])) {
if ($mime_info['action'] == 'exclude') {
continue; // skip this feed item entry
} elseif ($mime_info['action'] == 'link') {
if ($mime_info['type'] == 'image') {
$html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
} else {
$html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
}
$extracted_title = $mime_info['name'];
$do_content_extraction = false;
}
}
if ($do_content_extraction) {
$html = $single_page_response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $single_page_response['headers']);
debug("Retrieved single-page view from $effective_url");
}
unset($single_page_response);
}
}
if ($do_content_extraction) {
debug('--------');
debug('Attempting to extract content');
$extract_result = $extractor->process($html, $effective_url);
$readability = $extractor->readability;
$content_block = ($extract_result) ? $extractor->getContent() : null;
$extracted_title = ($extract_result) ? $extractor->getTitle() : '';
// Deal with multi-page articles
//die('Next: '.$extractor->getNextPageUrl());
$is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());
if ($options->multipage && $is_multi_page && $options->content) {
debug('--------');
debug('Attempting to process multi-page article');
$multi_page_urls = array();
$multi_page_content = array();
while ($next_page_url = $extractor->getNextPageUrl()) {
debug('--------');
debug('Processing next page: '.$next_page_url);
// If we've got URL, resolve against $url
if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) {
// check it's not what we have already!
if (!in_array($next_page_url, $multi_page_urls)) {
// it's not, so let's attempt to fetch it
$multi_page_urls[] = $next_page_url;
$_prev_ref = $http->referer;
if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) {
// make sure mime type is not something with a different action associated
$page_mime_info = get_mime_action_info($response['headers']);
if (!isset($page_mime_info['action'])) {
$html = $response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
if ($extractor->process($html, $next_page_url)) {
$multi_page_content[] = $extractor->getContent();
continue;
} else { debug('Failed to extract content'); }
} else { debug('MIME type requires different action'); }
} else { debug('Failed to fetch URL'); }
} else { debug('URL already processed'); }
} else { debug('Failed to resolve against '.$effective_url); }
// failed to process next_page_url, so cancel further requests
$multi_page_content = array();
break;
}
// did we successfully deal with this multi-page article?
if (empty($multi_page_content)) {
debug('Failed to extract all parts of multi-page article, so not going to include them');
$_page = $readability->dom->createElement('p');
$_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
$multi_page_content[] = $_page;
}
foreach ($multi_page_content as $_page) {
$_page = $content_block->ownerDocument->importNode($_page, true);
$content_block->appendChild($_page);
}
unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);
}
}
// use extracted title for both feed and item title if we're using single-item dummy feed
if ($isDummyFeed) {
$output->setTitle($extracted_title);
$newitem->setTitle($extracted_title);
} else {
// use extracted title instead of feed item title?
if (!$favour_feed_titles && $extracted_title != '') {
debug('Using extracted title in generated feed');
$newitem->setTitle($extracted_title);
}
}
}
if ($do_content_extraction) {
// if we failed to extract content...
if (!$extract_result) {
if ($exclude_on_fail) {
debug('Failed to extract, so skipping (due to exclude on fail parameter)');
continue; // skip this and move to next item
}
//TODO: get text sample for language detection
$html = $options->error_message;
// keep the original item description
$html .= $item->get_description();
} else {
$readability->clean($content_block, 'select');
// get base URL
$base_url = get_base_url($readability->dom);
if (!$base_url) $base_url = $effective_url;
// rewrite URLs
if ($options->rewrite_relative_urls) makeAbsolute($base_url, $content_block);
// footnotes
if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
$readability->addFootnotes($content_block);
}
// remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
// only follow these tag names
if (!in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) break;
//$html = $content_block->firstChild->innerHTML; // FTR 2.9.5
$content_block = $content_block->firstChild;
}
// convert content block to HTML string
// Need to preserve things like body: //img[@id='feature']
if (in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) {
$html = $content_block->innerHTML;
} else {
$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
}
//unset($content_block);
// post-processing cleanup
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
if ($links == 'remove') {
$html = preg_replace('!</?a[^>]*>!', '', $html);
}
// get text sample for language detection
$text_sample = strip_tags(substr($html, 0, 500));
$html = make_substitutions($options->message_to_prepend).$html;
$html .= make_substitutions($options->message_to_append);
}
}
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
} else {
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
}
// filter xss?
if ($xss_filter) {
debug('Filtering HTML to remove XSS');
$html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
}
// add content
if ($options->summary === true) {
// get summary
$summary = '';
if (!$do_content_extraction) {
$summary = $html;
} else {
// Try to get first few paragraphs
if (isset($content_block) && ($content_block instanceof DOMElement)) {
$_paras = $content_block->getElementsByTagName('p');
foreach ($_paras as $_para) {
$summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';
if (strlen($summary) > 200) break;
}
} else {
$summary = $html;
}
}
unset($_paras, $_para);
$summary = get_excerpt($summary);
$newitem->setDescription($summary);
if ($options->content) $newitem->setElement('content:encoded', $html);
} else {
if ($options->content) $newitem->setDescription($html);
}
// set date
if ((int)$item->get_date('U') > 0) {
$newitem->setDate((int)$item->get_date('U'));
} elseif ($extractor->getDate()) {
$newitem->setDate($extractor->getDate());
}
// add authors
if ($authors = $item->get_authors()) {
foreach ($authors as $author) {
// for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
if ($author->get_name() !== null) {
$newitem->addElement('dc:creator', $author->get_name());
} elseif ($author->get_email() !== null) {
$newitem->addElement('dc:creator', $author->get_email());
}
}
} elseif ($authors = $extractor->getAuthors()) {
//TODO: make sure the list size is reasonable
foreach ($authors as $author) {
// TODO: xpath often selects authors from other articles linked from the page.
// for now choose first item
$newitem->addElement('dc:creator', $author);
break;
}
}
// add language
if ($detect_language) {
$language = $extractor->getLanguage();
if (!$language) $language = $feed->get_language();
if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
try {
if ($use_cld) {
// Use PHP-CLD extension
$php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
$res = $php_cld($text_sample);
if (is_array($res) && count($res) > 0) {
$language = $res[0]['code'];
}
} else {
//die('what');
// Use PEAR's Text_LanguageDetect
if (!isset($l)) {
$l = new Text_LanguageDetect();
$l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")
}
$l_result = $l->detect($text_sample, 1);
if (count($l_result) > 0) {
$language = key($l_result);
}
}
} catch (Exception $e) {
//die('error: '.$e);
// do nothing
}
}
if ($language && (strlen($language) < 7)) {
$newitem->addElement('dc:language', $language);
}
}
// add MIME type (if it appeared in our exclusions lists)
if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
// add effective URL (URL after redirects)
if (isset($effective_url)) {
//TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-<2D>-25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
} else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
}
// add categories
if ($categories = $item->get_categories()) {
foreach ($categories as $category) {
if ($category->get_label() !== null) {
$newitem->addElement('category', $category->get_label());
}
}
}
// check for enclosures
if ($options->keep_enclosures) {
if ($enclosures = $item->get_enclosures()) {
foreach ($enclosures as $enclosure) {
// thumbnails
foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
$newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
}
if (!$enclosure->get_link()) continue;
$enc = array();
// Media RSS spec ($enc): http://search.yahoo.com/mrss
// SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
$enc['url'] = $enclosure->get_link();
if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
$newitem->addElement('media:content', '', $enc);
}
}
}
$output->addItem($newitem);
unset($html);
$item_count++;
}
// output feed
debug('Done!');
/*
if ($debug_mode) {
$_apc_data = apc_cache_info('user');
var_dump($_apc_data); exit;
}
*/
if (!$debug_mode) {
if ($callback) echo "$callback("; // if $callback is set, $format also == 'json'
if ($format == 'json') $output->setFormat(($callback === null) ? JSON : JSONP);
$add_to_cache = $options->caching;
// is smart cache mode enabled?
if ($add_to_cache && $options->apc && $options->smart_cache) {
// yes, so only cache if this is the second request for this URL
$add_to_cache = ($apc_cache_hits >= 2);
// purge cache
if ($options->cache_cleanup > 0) {
if (rand(1, $options->cache_cleanup) == 1) {
// apc purge code adapted from from http://www.thimbleopensource.com/tutorials-snippets/php-apc-expunge-script
$_apc_data = apc_cache_info('user');
foreach ($_apc_data['cache_list'] as $_apc_item) {
if ($_apc_item['ttl'] > 0 && ($_apc_item['ttl'] + $_apc_item['creation_time'] < time())) {
apc_delete($_apc_item['info']);
}
}
}
}
}
if ($add_to_cache) {
ob_start();
$output->genarateFeed(false);
$output = ob_get_contents();
ob_end_clean();
if ($html_only && $item_count == 0) {
// do not cache - in case of temporary server glitch at source URL
} else {
$cache = get_cache();
if ($add_to_cache) $cache->save($output, $cache_id);
}
echo $output;
} else {
$output->genarateFeed(false);
}
if ($callback) echo ');';
}

View file

@ -1,386 +0,0 @@
<?php
// Autoloading of classes allows us to include files only when they're
// needed. If we've got a cached copy, for example, only Zend_Cache is loaded.
function autoload($class_name) {
static $dir = null;
if ($dir === null) $dir = dirname(__FILE__).'/libraries/';
static $mapping = array(
// Include FeedCreator for RSS/Atom creation
'FeedWriter' => 'feedwriter/FeedWriter.php',
'FeedItem' => 'feedwriter/FeedItem.php',
// Include ContentExtractor and Readability for identifying and extracting content from URLs
'ContentExtractor' => 'content-extractor/ContentExtractor.php',
'SiteConfig' => 'content-extractor/SiteConfig.php',
// Include Humble HTTP Agent to allow parallel requests and response caching
'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
'CookieJar' => 'humble-http-agent/CookieJar.php',
// Include Zend Cache to improve performance (cache results)
'Zend_Cache' => 'Zend/Cache.php',
// Language detect
'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
// HTML5 Lib
'HTML5_Parser' => 'html5/Parser.php',
// htmLawed - used if XSS filter is enabled (xss_filter)
'htmLawed' => 'htmLawed/htmLawed.php'
);
if (isset($mapping[$class_name])) {
debug("** Loading class $class_name ({$mapping[$class_name]})");
require $dir.$mapping[$class_name];
return true;
} else {
return false;
}
}
spl_autoload_register('autoload');
class DummySingleItemFeed {
public $item;
function __construct($url) { $this->item = new DummySingleItem($url); }
public function get_title() { return ''; }
public function get_description() { return 'Content extracted from '.$this->item->url; }
public function get_link() { return $this->item->url; }
public function get_language() { return false; }
public function get_image_url() { return false; }
public function get_items($start=0, $max=1) { return array(0=>$this->item); }
}
class DummySingleItem {
public $url;
function __construct($url) { $this->url = $url; }
public function get_permalink() { return $this->url; }
public function get_title() { return null; }
public function get_date($format='') { return false; }
public function get_author($key=0) { return null; }
public function get_authors() { return null; }
public function get_description() { return ''; }
public function get_enclosure($key=0, $prefer=null) { return null; }
public function get_enclosures() { return null; }
public function get_categories() { return null; }
}
///////////////////////////////
// HELPER FUNCTIONS
///////////////////////////////
// Adapted from WordPress
// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
function get_excerpt($text, $num_words=55, $more=null) {
if (null === $more) $more = '&hellip;';
$text = strip_tags($text);
//TODO: Check if word count is based on single characters (East Asian characters)
/*
if (1==2) {
$text = trim(preg_replace("/[\n\r\t ]+/", ' ', $text), ' ');
preg_match_all('/./u', $text, $words_array);
$words_array = array_slice($words_array[0], 0, $num_words + 1);
$sep = '';
} else {
$words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
$sep = ' ';
}
*/
$words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
$sep = ' ';
if (count($words_array) > $num_words) {
array_pop($words_array);
$text = implode($sep, $words_array);
$text = $text.$more;
} else {
$text = implode($sep, $words_array);
}
// trim whitespace at beginning or end of string
// See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
$text = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
return $text;
}
function url_allowed($url) {
global $options;
if (!empty($options->allowed_urls)) {
$allowed = false;
foreach ($options->allowed_urls as $allowurl) {
if (stristr($url, $allowurl) !== false) {
$allowed = true;
break;
}
}
if (!$allowed) return false;
} else {
foreach ($options->blocked_urls as $blockurl) {
if (stristr($url, $blockurl) !== false) {
return false;
}
}
}
return true;
}
//////////////////////////////////////////////
// Convert $html to UTF8
// (uses HTTP headers and HTML to find encoding)
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
//////////////////////////////////////////////
function convert_to_utf8($html, $header=null)
{
$encoding = null;
if ($html || $header) {
if (is_array($header)) $header = implode("\n", $header);
if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
// error parsing the response
debug('Could not find Content-Type header in HTTP response');
} else {
$match = end($match); // get last matched element (in case of redirects)
if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
}
// TODO: check to see if encoding is supported (can we convert it?)
// If it's not, result will be empty string.
// For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
// Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
if (!$encoding || $encoding == 'none') {
// search for encoding in HTML - only look at the first 50000 characters
// Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
// TODO: improve this so it looks at smaller chunks first
$html_head = substr($html, 0, 50000);
if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
$encoding = trim($match[1], '"\'');
} elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
$encoding = trim($match[1]);
} elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
foreach ($match[1] as $_test) {
if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
$encoding = trim($_m[1]);
break;
}
}
}
}
if (isset($encoding)) $encoding = trim($encoding);
// trim is important here!
if (!$encoding || (strtolower($encoding) == 'iso-8859-1')) {
// replace MS Word smart qutoes
$trans = array();
$trans[chr(130)] = '&sbquo;'; // Single Low-9 Quotation Mark
$trans[chr(131)] = '&fnof;'; // Latin Small Letter F With Hook
$trans[chr(132)] = '&bdquo;'; // Double Low-9 Quotation Mark
$trans[chr(133)] = '&hellip;'; // Horizontal Ellipsis
$trans[chr(134)] = '&dagger;'; // Dagger
$trans[chr(135)] = '&Dagger;'; // Double Dagger
$trans[chr(136)] = '&circ;'; // Modifier Letter Circumflex Accent
$trans[chr(137)] = '&permil;'; // Per Mille Sign
$trans[chr(138)] = '&Scaron;'; // Latin Capital Letter S With Caron
$trans[chr(139)] = '&lsaquo;'; // Single Left-Pointing Angle Quotation Mark
$trans[chr(140)] = '&OElig;'; // Latin Capital Ligature OE
$trans[chr(145)] = '&lsquo;'; // Left Single Quotation Mark
$trans[chr(146)] = '&rsquo;'; // Right Single Quotation Mark
$trans[chr(147)] = '&ldquo;'; // Left Double Quotation Mark
$trans[chr(148)] = '&rdquo;'; // Right Double Quotation Mark
$trans[chr(149)] = '&bull;'; // Bullet
$trans[chr(150)] = '&ndash;'; // En Dash
$trans[chr(151)] = '&mdash;'; // Em Dash
$trans[chr(152)] = '&tilde;'; // Small Tilde
$trans[chr(153)] = '&trade;'; // Trade Mark Sign
$trans[chr(154)] = '&scaron;'; // Latin Small Letter S With Caron
$trans[chr(155)] = '&rsaquo;'; // Single Right-Pointing Angle Quotation Mark
$trans[chr(156)] = '&oelig;'; // Latin Small Ligature OE
$trans[chr(159)] = '&Yuml;'; // Latin Capital Letter Y With Diaeresis
$html = strtr($html, $trans);
}
if (!$encoding) {
debug('No character encoding found, so treating as UTF-8');
$encoding = 'utf-8';
} else {
debug('Character encoding: '.$encoding);
if (strtolower($encoding) != 'utf-8') {
debug('Converting to UTF-8');
$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
}
}
}
return $html;
}
function makeAbsolute($base, $elem) {
$base = new SimplePie_IRI($base);
// remove '//' in URL path (used to prevent URLs from resolving properly)
// TODO: check if this is still the case
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
$elems = $elem->getElementsByTagName($tag);
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
//$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
makeAbsoluteAttr($base, $e, $attr);
}
if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr);
}
}
function makeAbsoluteAttr($base, $e, $attr) {
if ($e->hasAttribute($attr)) {
// Trim leading and trailing white space. I don't really like this but
// unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
$url = str_replace(' ', '%20', $url);
if (!preg_match('!https?://!i', $url)) {
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
$e->setAttribute($attr, $absolute);
}
}
}
}
function makeAbsoluteStr($base, $url) {
$base = new SimplePie_IRI($base);
// remove '//' in URL path (causes URLs not to resolve properly)
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
if (preg_match('!^https?://!i', $url)) {
// already absolute
return $url;
} else {
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
return $absolute;
}
return false;
}
}
// returns single page response, or false if not found
function getSinglePage($item, $html, $url) {
global $http, $extractor;
debug('Looking for site config files to see if single page link exists');
$site_config = $extractor->buildSiteConfig($url, $html);
$splink = null;
if (!empty($site_config->single_page_link)) {
$splink = $site_config->single_page_link;
} elseif (!empty($site_config->single_page_link_in_feed)) {
// single page link xpath is targeted at feed
$splink = $site_config->single_page_link_in_feed;
// so let's replace HTML with feed item description
$html = $item->get_description();
}
if (isset($splink)) {
// Build DOM tree from HTML
$readability = new Readability($html, $url);
$xpath = new DOMXPath($readability->dom);
// Loop through single_page_link xpath expressions
$single_page_url = null;
foreach ($splink as $pattern) {
$elems = @$xpath->evaluate($pattern, $readability->dom);
if (is_string($elems)) {
$single_page_url = trim($elems);
break;
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
foreach ($elems as $item) {
if ($item instanceof DOMElement && $item->hasAttribute('href')) {
$single_page_url = $item->getAttribute('href');
break 2;
} elseif ($item instanceof DOMAttr && $item->value) {
$single_page_url = $item->value;
break 2;
}
}
}
}
// If we've got URL, resolve against $url
if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
// check it's not what we have already!
if ($single_page_url != $url) {
// it's not, so let's try to fetch it...
$_prev_ref = $http->referer;
$http->referer = $single_page_url;
if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
$http->referer = $_prev_ref;
return $response;
}
$http->referer = $_prev_ref;
}
}
}
return false;
}
// based on content-type http header, decide what to do
// param: HTTP headers string
// return: array with keys: 'mime', 'type', 'subtype', 'action', 'name'
// e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
function get_mime_action_info($headers) {
global $options;
// check if action defined for returned Content-Type
$info = array();
if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $headers, $match)) {
// look for full mime type (e.g. image/jpeg) or just type (e.g. image)
// match[1] = full mime type, e.g. image/jpeg
// match[2] = first part, e.g. image
// match[3] = last part, e.g. jpeg
$info['mime'] = strtolower(trim($match[1]));
$info['type'] = strtolower(trim($match[2]));
$info['subtype'] = strtolower(trim($match[3]));
foreach (array($info['mime'], $info['type']) as $_mime) {
if (isset($options->content_type_exc[$_mime])) {
$info['action'] = $options->content_type_exc[$_mime]['action'];
$info['name'] = $options->content_type_exc[$_mime]['name'];
break;
}
}
}
return $info;
}
function remove_url_cruft($url) {
// remove google analytics for the time being
// regex adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
// https://gist.github.com/758177
return preg_replace('/(\?|\&)utm_[a-z]+=[^\&]+/', '', $url);
}
function make_substitutions($string) {
if ($string == '') return $string;
global $item, $effective_url;
$string = str_replace('{url}', htmlspecialchars($item->get_permalink()), $string);
$string = str_replace('{effective-url}', htmlspecialchars($effective_url), $string);
return $string;
}
function get_cache() {
global $options, $valid_key;
static $cache = null;
if ($cache === null) {
$frontendOptions = array(
'lifetime' => 10*60, // cache lifetime of 10 minutes
'automatic_serialization' => false,
'write_control' => false,
'automatic_cleaning_factor' => $options->cache_cleanup,
'ignore_user_abort' => false
);
$backendOptions = array(
'cache_dir' => ($valid_key) ? $options->cache_dir.'/rss-with-key/' : $options->cache_dir.'/rss/', // directory where to put the cache files
'file_locking' => false,
'read_control' => true,
'read_control_type' => 'strlen',
'hashed_directory_level' => $options->cache_directory_level,
'hashed_directory_perm' => 0777,
'cache_file_perm' => 0664,
'file_name_prefix' => 'ff'
);
// getting a Zend_Cache_Core object
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
}
return $cache;
}
function debug($msg) {
global $debug_mode;
if ($debug_mode) {
echo '* ',$msg,"\n";
ob_flush();
flush();
}
}
function get_base_url($dom) {
$xpath = new DOMXPath($dom);
$base_url = @$xpath->evaluate('string(//head/base/@href)', $dom);
if ($base_url !== '') {
return $base_url;
} else {
return false;
}
}

View file

@ -1,6 +0,0 @@
Full-Text RSS Site Patterns
---------------------------
Site patterns allow you to specify what should be extracted from specific sites.
Please see http://help.fivefilters.org/customer/portal/articles/223153-site-patterns for more information.

View file

@ -1,6 +0,0 @@
title: //title
body: //h2 | //span[@class='masque'] | //article[@class='corps_article_right']
prune: no
tidy: no
test_url: http://www.bfmtv.com/societe/cigarette-electronique-dangers-588622.html

View file

@ -1,45 +0,0 @@
# Author: zinnober
tidy: no
prune: no
# Set author
author: //a[@rel='author']
# Set date
date: //span[@class='Datum']
# Content is here
body: //div[@class='Artikel']
# Tidy up before article
strip: //div[@id='FAZHeaderNeu']
strip: //h2[@itemprop='headline']
strip: //span[@class='Datum']
strip: //span[@class='Autor']
strip_id_or_class: ArticlePagerTop
strip: //div[@class='FAZArtikelEinleitung']/h2
# General cleanup
strip: //div[@class='clear']
strip: //span[@class='Bildnachweis']
strip: //iframe
strip_id_or_class: Community
strip: ' · '
# Remove tracking and ads
strip_image_src: /l.gif?
strip: //img[@width='1']
strip_id_or_class: invisible
strip_id_or_class: Anzeige
strip_id_or_class: billboard
# Remove clutter after article
strip_id_or_class: Tagline
strip_id_or_class: ArtikelAbbinder
strip_id_or_class: FAZArtikelKommentare
strip_id_or_class: ArtikelKommentieren
strip_id_or_class: FAZContentRight
# Try it yourself
test_url: http://blogs.faz.net/wost/2014/08/17/viel-fuck-und-wenig-guter-sex-1239/

View file

@ -1,12 +0,0 @@
title: //title
body: //iframe
replace_string(<![CDATA[): _
replace_string(]]>): _
single_page_link: //link[@type='application/xml+oembed']
prune: no
tidy: no
http://www.dailymotion.com/video/x1vk5oh_before-they-were-on-game-of-thrones_people

View file

@ -1,4 +0,0 @@
title: //div[contains(@class, 'SB_Title')]//a
body: //div[contains(@class, 'STR_Content')]
test_url: http://dilbert.com/strips/comic/2013-10-22

View file

@ -1,3 +0,0 @@
<?php
// this is here to prevent directory listing over the web
?>

View file

@ -1,4 +0,0 @@
title: //title
body: //div[contains(@class, 'block')]
test_url: http://www.interviewmagazine.com/film/spike-jonze

View file

@ -1,6 +0,0 @@
title: //h2
body: div[@id='illustration'] | //p
prune: no
tidy: no
test_url: http://mobile.lemondeinformatique.fr/actualites/lire-les-datacenters-d-apple-google-et-facebook-eco-responsables-selon-greenpeace-le-monde-informatique-57122.html

View file

@ -1,4 +0,0 @@
title: //title
body: //div[@id='question']//div[contains(@class,'post-text')] | //div[@id='answers-header']//h2 | //div[contains(@class,'accepted-answer')]//div[contains(@class,'post-text')]
test_url: http://cstheory.stackexchange.com/questions/14811/what-is-the-enlightenment-im-supposed-to-attain-after-studying-finite-automata/14818#14818

View file

@ -1,4 +0,0 @@
title: //title
body: //div[@id='question']//div[contains(@class,'post-text')] | //div[@id='answers-header']//h2 | //div[contains(@class,'accepted-answer')]//div[contains(@class,'post-text')]
test_url: http://stackoverflow.com/questions/20302422/calling-a-function-from-a-javascript-object

View file

@ -1,11 +0,0 @@
title: //title
body: //div[@class='talk-article__body talk-transcript__body'] | //div[@class='media__image media__image--thumb talk-link__image']
strip_id_or_class: talk-transcript__para__time
single_page_link: //a[@id='hero-transcript-link']
#prune: no
tidy: no
test_url: http://www.ted.com/talks/andrew_solomon_how_the_worst_moments_in_our_lives_make_us_who_we_are

View file

@ -1,6 +0,0 @@
title: //title
body: //h2 | //p | //ul
prune: no
tidy: no
test_url: http://www.tldp.org/HOWTO/Plug-and-Play-HOWTO-7.html

View file

@ -1,2 +0,0 @@
<?php
// this is here to prevent directory listing over the web

View file

@ -1,14 +0,0 @@
body: //div[@id='articlebody']
title: //h1
author: //p[@id='by']//a
next_page_link: //span[@class='next']/a
# Not the same as below!
prune: yes
tidy: no
# Annoying 'next' links plainly inside the article body
strip: //*[text()[contains(.,'Next: ')]]
test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm

View file

@ -1,19 +0,0 @@
title: //h1[@id='firstHeading']
body: //div[@id = 'bodyContent']
strip_id_or_class: editsection
#strip_id_or_class: toc
strip_id_or_class: vertical-navbox
strip: //table[@id='toc']
strip: //div[@id='catlinks']
strip: //div[@id='jump-to-nav']
strip: //div[@class='thumbcaption']//div[@class='magnify']
strip: //table[@class='navbox']
strip: //table[contains(@class, 'infobox')]
strip: //div[@class='dablink']
strip: //div[@id='contentSub']
strip: //table[contains(@class, 'metadata')]
strip: //*[contains(@class, 'noprint')]
strip: //span[@title='pronunciation:']
prune: no
tidy: no
test_url: http://en.wikipedia.org/wiki/Christopher_Lloyd

View file

@ -1,6 +0,0 @@
title: //div[@class='meta']/h2/a
author: //div[@class='meta']/h2/following-sibling::p/a/text()
date://div[@class='meta']/h2/strong
body: //div[@id='article']
strip: //div[@class='domore']
test_url: http://24ways.org/2011/composing-the-new-canon

View file

@ -1,8 +0,0 @@
title: //h1[contains(@class, 'entry-title')]
date: //meta[@name='weibo: article:create_at']/@content
body: //div[contains(@class, 'mainContent')]
strip_id_or_class: related_topics
prune: no
test_url: http://www.36kr.com/p/207879.html

View file

@ -1,6 +0,0 @@
title: //div[@class='post_header']//h2/a
author: //span[@class='author']
date: //span[@class='date']
body: //div[@id='Content']
test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department

View file

@ -1,9 +0,0 @@
body: //div[@class='content']
date: //div[@class='content']/h2
strip: //div[@class='content']/h2
title: //div[@class='content']/h3
strip: //div[@id='postmenu']
strip: //div[@class='trackback']
tidy: no
test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html

View file

@ -1,11 +0,0 @@
body: //div[@id='main']
title: //div[@class='intro']/h1
author: //ul[@class='text-data']/li[@class='author']
date: //ul[@class='text-data']/li[@class='date']
convert_double_br_tags: yes
tidy: no
strip: //div[@class='share']
strip: //*[@class='zoom']
strip: //div[@id='disqus_thread']
test_url: http://3voor12.vpro.nl/nieuws/2012/januari/Ook-website-GroenLinks-woensdag-op-zwart-i-v-m--SOPA.html

View file

@ -1,4 +0,0 @@
body: //*[@class = 'content']
author: //*[@class = 'submitted']/a
date: substring-after(//*[@class = 'submitted']/text(), '|')
test_url: http://www.43folders.com/2011/04/22/cranking

View file

@ -1,27 +0,0 @@
# very loose setup for both 500px.com/photo/* and 500px.com/blog/*
# photo page example: http://500px.com/photo/4181666
# blog page example: http://500px.com/blog/110
# avoid "no text" error
tidy:no
prune:no
# reorganize photo page elements
#body://div[contains(@class,'container')]
move_into(body)://div[contains(@id,'thephoto')]
move_into(body)://div[contains(@id,'description')]
move_into(body)://div[contains(@id,'tags')]
move_into(body)://div[contains(@id,'photo-info')]
# clean photo page info
strip://span[contains(@id,'copyright')]
strip://*[contains(@id,'store')]
strip://*[contains(@id,'user-info')]
strip://*[contains(@id,'photo-stats')]
strip://*[contains(@id,'voting_controls_container')]
strip://*[contains(@id,'more-photos')]
strip://*[contains(@id,'embed-photo')]
# clean blog page side bar
strip://*[contains(@class,'col d3 clearafter')]
test_url: http://500px.com/photo/3641041?from=editors

View file

@ -1,2 +0,0 @@
title: //meta[@property='og:title']/@content
test_url: http://www.512pixels.net/blog/2014/10/the-move

View file

@ -1,9 +0,0 @@
body: //*[@id="episode"]
prune: no
tidy: no
autodetect_next_page: no
strip_id_or_class: player
strip://*[@id="header"]
test_url: http://5by5.tv/buildanalyze/60

View file

@ -1,7 +0,0 @@
title: //*[@id='sstitle']
body: //div[@id='sstory']
strip_id_or_class: newsoptions
prune: no
test_url: http://www.7newsbelize.com/sstory.php?nid=25654
test_url: http://www.7newsbelize.com/7news.xml

View file

@ -1,9 +0,0 @@
title: //h2[@class='border']
body: //div[@class='padding']
convert_double_br_tags: yes
strip: //div[@id='social_sharing']
strip: //div[@class='socialLinks']
test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/

View file

@ -1,40 +0,0 @@
Full-Text RSS site config files
================
[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no rules are found, it tries to detect the content block automatically.
This repository contains the site-specific extraction rules we rely on in Full-Text RSS.
### Contributing changes
We run automated tests on these files to detect issues. If you'd like to help keep these up to date, please look at the [test results](http://siteconfig.fivefilters.org/test/) and see which files you'd like to contribute fixes for.
We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
> The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination.
When we receive a pull request we'll review the changes and if everything's okay we'll update our copy.
If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github).
### How to write a site config file
The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool only intended to create a rule to extract the correct content block.
For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns).
### Instapaper
When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users.
Marco, Instapaper's creator, graciously opened up the database of contributions to everyone:
> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (no longer available since Instapaper was sold).
### Testing site config files
Currently you will have to have a copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier.

View file

@ -1,10 +0,0 @@
title: //meta[@property='og:title']/@content
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
strip_id_or_class: socialshareprivacy1
strip_id_or_class: zvaFacebookButton
tidy: no
prune: no
test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757

View file

@ -1,10 +0,0 @@
title: //meta[@property='og:title']/@content
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
strip_id_or_class: socialshareprivacy1
strip_id_or_class: zvaFacebookButton
tidy: no
prune: no
test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718

View file

@ -1,7 +0,0 @@
title: //meta[@property='og:title']/@content
body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text' or @itemprop='articleBody']
strip_id_or_class: colB
prune: no
test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html

View file

@ -1,18 +0,0 @@
title: //div[@class='article section']//h1
author: //div[@class="byline"]/a
date: //span[@class="timestamp"]
body: //div[@class="page section"]
strip: //a[@class="inline-caption"]
strip: //p[@class="ticker section noprint"]
strip: //p[@class="topics"]
strip: //h1
strip: //div[@class="byline"]
strip: //p[@class="published"]
strip: //div[contains(@class,"featured-scroller")]
strip_id_or_class: footer
tidy: no
test_url: http://www.abc.net.au/news/2013-03-27/open-speed-highways-change-clp-giles/4597892
test_url: http://www.abc.net.au/news/2013-04-30/credit-growth-remains-subdued/4660054?section=business

View file

@ -1,27 +0,0 @@
title: //h1[@class='headline']
body: //div[@id='storyText']
# for video entries
body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]
author: //div[@class='byline']
date: //div[@class='date']
strip: //*[@id='date_partner']
strip: //div[@class='breadcrumb']
strip: //div[contains(@class,'show_tools')]
strip: //div[@id='sponsoredByAd']
strip: //div[contains(@class,'rel_container')]
strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]
strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]
strip: //p[contains(., 'Click here to return to')]
#strip_id_or_class: media
strip_id_or_class: mediaplayer
replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http
prune: no
single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')
test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744
# multi-page
test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544

View file

@ -1,9 +0,0 @@
title: //div[@id='H_docTitle']
body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']
strip_id_or_class: F_toenail
prune: no
test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html

View file

@ -1,3 +0,0 @@
body: //div[starts-with(@id, 'news-id-')]
test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html

View file

@ -1,9 +0,0 @@
title://h1[@class="title"]
author://div[@class="submitted"]/span/a
date://div[@class="submitted"]/span
body://div[@class="content-wrapper"]
strip://div[@id="skip-link"]
strip://div[@id="region-content-3-3"]
strip://div[@id="section-footer"]
test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code

View file

@ -1,5 +0,0 @@
tidy:no
date: //time[@class='updated']
dissolve: //ul[@class='video-gallery']/li
dissolve: //ul[@class='video-gallery']
test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php

View file

@ -1,6 +0,0 @@
# Generated by FiveFilters.org's web-based selection tool
# Place this file inside your site_config/custom/ folder
# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.adme.ru%2Ftvorchestvo-hudozhniki%2Fprostoj-kak-5-kopeek-hudozhnik-557405%2F
body: //article[contains(concat(' ',normalize-space(@class),' '),' article ')]
test_url: http://www.adme.ru/tvorchestvo-hudozhniki/prostoj-kak-5-kopeek-hudozhnik-557405/

View file

@ -1,5 +0,0 @@
title: //h1[@class='articleTitle ']
body: //div[@class='bodyText widget storyContent']
strip: //p/span[@class='quote']/..
strip_id_or_class: 'pull1'
test_url: https://www.aftenposten.no/meninger/spaltister/Portrett-av-scenekunstneren-som-ung-mann-7167959.html

View file

@ -1,13 +0,0 @@
author: //article//address[contains(@class, 'author')]
body: //article[.//div[contains(@class, 'abBodyText')]]//*[contains(@class, 'abLeadText') or contains(@class, 'abBodyText') or contains(@class, 'abImageBlock') or contains(@class, 'abIGSatellite')]
strip: //address//img
strip: //footer
strip_id_or_class: abSticky
prune: no
test_url: http://www.aftonbladet.se/sportbladet/hockey/sverige/allsvenskan/article17498194.ab
test_url: http://www.aftonbladet.se/debatt/article16207536.ab
test_url: http://www.aftonbladet.se/debatt/debattamnen/politik/article17483377.ab
test_url: http://www.aftonbladet.se/rss.xml

View file

@ -1,15 +0,0 @@
body: //div[@id='content']
# clean up recipe pages
strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
#recipe pages
strip_id_or_class: "recipe-feedback"
strip_id_or_class: "comments"
strip_id_or_class: "procedure-number"
strip_id_or_class: "more-with-author"
#slice
strip_id_or_class: "inner"
test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html

View file

@ -1,6 +0,0 @@
body: //div[@id='main-column']//div[@class='content']
prune: no
test_url: http://www.albayan.ae/across-the-uae/education/2013-08-29-1.1949645
test_url: http://www.albayan.ae/1.448?ot=ot.AjaxPageLayout

View file

@ -1,2 +0,0 @@
body: //div[@class="entry"]
test_url: http://alex.mullr.net/blog/2011/05/on-spotify/

View file

@ -1,4 +0,0 @@
body: //section[@class='content']
date: //span[1]
author: //h1[@id='sitetitle']
test_url: http://alexduner.com/blog/something-i-learned-today

View file

@ -1,4 +0,0 @@
body: //section[@class='content']
date: //span[1]
author: //h1[@id='sitetitle']
test_url: https://alexduner.squarespace.com/blog/2013/1/tech-culture-from-the-outside-looking-in

View file

@ -1,12 +0,0 @@
title: //h1[@class='title']
author: //h3[@class='byline']/a
date: //div[@class='ishinfo']
body: //*[@id='articletext']
strip_id_or_class: 'ishinfo'
strip_id_or_class: 'metastuff'
strip_id_or_class: 'learnmore'
strip_id_or_class: 'discuss'
prune: no
test_url: http://www.alistapart.com/articles/organizing-mobile/

View file

@ -1,8 +0,0 @@
title: //span[@id='DetailedTitle']
body: //td[@id='tdTextContent']
strip_id_or_class: Skyscrapper_Body
date: //span[@id='ctl00_cphBody_lblDate']
author: //div[@id="dvAuthorInfo"]//a/text()
strip: //table[ tbody/tr/td/object ]
prune: no
test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html

View file

@ -1,14 +0,0 @@
title: //h1[@id='itemTitle']
body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]
strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']
strip: //div[contains(@class, 'rightcoltoolsdiv')]
strip: //div[contains(@class, 'servings-form')]
strip: //p[@class='nutritional-information']
strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]
strip: //div[@id='nutri-info']/div[contains(@class, 'title')]
strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']
strip_id_or_class: eshaAttribute
strip_id_or_class: eshaParagraph
prune: no
test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd

View file

@ -1,13 +0,0 @@
title://div[@class="article-title"]/h1[@class="title"]
date: //p[@class="article-date"]
body://div[contains(@class, "article-body")]
# Trim out related posts at bottom of article
strip://blockquote[@class="memo"]
tidy: no
# Yup, no idea why author won't work...
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
test_url: http://allthingsd.com/20131010/google-cio-ben-fried-on-how-google-works/

View file

@ -1,8 +0,0 @@
title: //div[@id='pageHdr']//h1
body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']
strip: //div[contains(@class, 'infoBox') or @id='infoBox']
single_page_link: //li[@id='print']/a
prune: no
test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/

View file

@ -1,11 +0,0 @@
body: //div[@class = 'entry']
date: substring-after(//p[@class="date"],'بتاريخ ')
strip_id_or_class: date
strip_id_or_class: follow-single
strip_id_or_class: ratingblock
strip_id_or_class: newRatingHolder
strip_id_or_class: postmetadata
strip_id_or_class: addthis_toolbox
strip_id_or_class: addthis_default_style
strip_id_or_class: size-full
test_url: http://alphabeta.argaam.com/?p=35657

View file

@ -1,9 +0,0 @@
body: //div[@id = "article-view"]
body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]
author: //p[@class = "author"]
strip: //h1
strip: //h2
strip_id_or_class: author
prune: no
test_url: http://www.alriyadh.com/2011/10/10/article674357.html
test_url: http://www.alriyadh.com/net/article/780935

View file

@ -1,2 +0,0 @@
title: //*[@id='normalfontyellow']
test_url: http://www.alseraj.net/cgi-bin/pros/av/LeqaTextDisplay.cgi?display&2

View file

@ -1,2 +0,0 @@
body: //*[(@class = "historia")]
test_url: http://alt1040.com/2011/09/banda-ancha-en-america-latina-insignificante

View file

@ -1,4 +0,0 @@
single_page_link: //div[contains(@class, 'story_tools')]//a[contains(@href, '/print/')]
test_url: http://www.alternet.org/civil-liberties/noam-chomsky-surveillance-state-beyond-imagination-being-created-one-freest
test_url: http://feeds.feedblitz.com/alternet

View file

@ -1,2 +0,0 @@
body: //*[(@class = "historia")]
test_url: http://altfoto.com/2011/09/nikon-presenta-su-nuevo-sistema-nikon-1-y-dos-nuevas-camaras

View file

@ -1,10 +0,0 @@
title: //h1
author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")
date: //div/a[contains (@href, "issue")]
move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]
body: //div[@class="enableBullets"]
test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819

View file

@ -1,6 +0,0 @@
body: //div[@id='content']//div[contains(@class, 'content')]
strip_id_or_class: widget
strip: //a[contains(@href, 'upm_export=')]
test_url: http://amandala.com.bz/news/feed/
test_url: http://amandala.com.bz/news/poor-pse-results-30-raise/

View file

@ -1,19 +0,0 @@
title: //span[@id = 'btAsinTitle']
body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div
#strip_id_or_class: quantityDropdownDiv
#strip_id_or_class: addToCartSpan
#strip_id_or_class: oneClickDiv
strip_id_or_class: nocontent
strip_id_or_class: masDynamicConten
strip_id_or_class: dynamic-content
prune: no
find_string: <span id="actualPriceValue">
replace_string: <span id="actualPriceValue"><br />Price:
strip_id_or_class: collapsePS
strip_id_or_class: expandPS
strip_id_or_class: psPlaceHolde
strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]
test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/

View file

@ -1,6 +0,0 @@
title: //div[@class='head']/h2/a
author: //div[@class='head']/a
date: //div[@class='head']/p[@class='date']/a
body: //div[@class='copy']
strip: //p[@class='meta']
test_url: http://americandrink.net/post/10567188712/free-the-hooch

View file

@ -1,10 +0,0 @@
title: //div[@class="editorial-content"]/h3
body: //div[@class="hero-image" or @class="editorial-content"]
strip: //ul[@class="hero-caption"]
strip_id_or_class: footer
prune: no
tidy: no
test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/

View file

@ -1,5 +0,0 @@
title: //h1[@class="post-title"]
author: //span[@class="author"]/a
date: //span[@class="date"]
body: //div[@class="post-content main"]
test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/

View file

@ -1,8 +0,0 @@
title: //title
body: //div[@class="entry-content"]
author: //span[@class="author vcard"]
date: //span[@class="entry-date"]
test_url: http://www.amptoons.com/blog/2013/03/14/open-thread-and-link-farm-i-hate-being-sick-edition/

View file

@ -1,15 +0,0 @@
body: //section[@class='main_cont']/img | //div[@class='articleContent']
title: //div[@class='blog_top_left']//h2
author: //a[@class='b'][1]
date: substring-after(substring-before(//div, 'Posted in'), ' on ')
strip_image_src: /content/images/globals/
strip: //h2[. = 'Page 1']/preceding::p
strip: //h2
prune: no
single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
test_url: http://www.anandtech.com/show/8370/gigabyte-am1m-s2h-review
test_url: http://www.anandtech.com/show/8402/sandisk-releases-ultra-ii-ssd-the-second-tlc-nand-ssd-in-the-market
test_url: http://www.anandtech.com/show/8400/arms-cortex-m-even-smaller-and-lower-power-cpu-cores

View file

@ -1,5 +0,0 @@
body: //div[@class='post_content']
date: //div[@class='date_day'] | div[@class='date_month']
test_url: http://www.androidpolice.com/2014/03/30/music-boss-for-pebble-can-now-control-playback-and-volume-on-chromecast-content-from-your-smartwatch/

View file

@ -1,9 +0,0 @@
title: //h2
author: string('Andy Rutledge')
date: //div[@class='articledate']
body: //div[@class='copybody']
strip: //*[@class='space']
strip: //*[@class='articleFoot']
test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php

View file

@ -1,9 +0,0 @@
title: //h1[@class="title"]
author: ("Anna Manasova")
# is ignored, unfortunately
date: //p[@class="date"]
body: //div[@class="entry"]
test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/

View file

@ -1,23 +0,0 @@
# Author: zinnober
prune: no
title: substring-before(//div[@id='content']/h1, ',')
single_page_link: //a[@title='Seite drucken']
body: //div[@id='detail-body']
replace_string(<span class="description">): <em>
replace_string(<p class="leadtext"><small>): <p class="leadtext">
# Fix headlines
replace_string(Patrick Hollstein): &nbsp;
replace_string(APOTHEKE ADHOC): &nbsp;
replace_string(dpa): &nbsp;
replace_string(Katharina Lübke): &nbsp;
replace_string(Julia Pradel): &nbsp;
replace_string(Franziska Gerhardt): &nbsp;
test_url: http://www.apotheke-adhoc.de/nachrichten/politik/nachricht-detail-politik/deutscher-apothekertag-antraege-gegen-lieferengpaesse-2/

Some files were not shown because too many files have changed in this diff Show more