pax_global_header 0000666 0000000 0000000 00000000064 13772136612 0014522 g ustar 00root root 0000000 0000000 52 comment=12743e852020677df4e9f62c5effc5c69dde11cb
tweeper-1.4.3/ 0000775 0000000 0000000 00000000000 13772136612 0013202 5 ustar 00root root 0000000 0000000 tweeper-1.4.3/HACKING 0000664 0000000 0000000 00000001206 13772136612 0014170 0 ustar 00root root 0000000 0000000 The code follows the Drupal coding standards:
https://www.drupal.org/coding-standards
Style compliance can be checked using the Coder Sniffer extension to the PEAR
PHP_CodeSniffer project, for instructions about how to install Coder Sniffer
see https://www.drupal.org/node/1419988
TL;DR: install drupal/coder and enable the Drupal coding standard in
PHP_CodeSniffer:
$ composer global require drupal/coder
$ export PATH="$HOME/.config/composer/vendor/bin:$PATH"
$ phpcs --config-set installed_paths $HOME/.config/composer/vendor/drupal/coder/coder_sniffer/
And then use this command to check the style:
$ phpcs --standard=Drupal .
tweeper-1.4.3/INSTALL 0000664 0000000 0000000 00000001454 13772136612 0014237 0 ustar 00root root 0000000 0000000 The recommended way to install tweeper globally is to install all its files
under /usr/share/php/tweeper and then make a symlink to the wrapper script
"tweeper" under /usr/bin
NOTES FOR PACKAGERS
Even though the php json extensions are used, namely json_decode(), a direct
dependency on php-json is not usually strictly necessary, because (at least on
Debian) php-cli already depends on it.
Tweeper depends on php-symfony-serializer which is used to convert json to xml
for some sites which provide the timeline data in json rather than in directly
transformable html.
Tweeper also depends (indirectly) on php-symfony-property-access because the
code relies on the ObjectNormalizer class which requires the PropertyAccess
component, see
http://symfony.com/doc/current/components/serializer.html#installation
tweeper-1.4.3/Makefile 0000664 0000000 0000000 00000001512 13772136612 0014641 0 ustar 00root root 0000000 0000000 # Packagers may want to override this!
# Installation layout; every directory below derives from $(prefix), and
# $(DESTDIR) is honoured by all the install recipes for staged installs.
prefix ?= /usr/local
PHP_SCRIPT_DIR ?= $(prefix)/share/php
BIN_DIR := $(prefix)/bin
MAN_DIR := $(prefix)/share/man
TWEEPER_DIR := $(PHP_SCRIPT_DIR)/tweeper

# None of these targets creates a file named after itself, so declare them
# phony to keep them working even if such a file shows up in the tree.
.PHONY: all clean docs installdocs install

# Nothing to build by default.
all:

# Remove the generated man page.
clean:
	rm -f tweeper.1

# Generate the man page tweeper.1 from its asciidoc source.
docs:
	a2x -f manpage tweeper.1.asciidoc

# Install the generated man page.
installdocs: docs
	install -d $(DESTDIR)$(MAN_DIR)/man1
	install -m644 tweeper.1 $(DESTDIR)$(MAN_DIR)/man1

# Install the PHP files and the wrapper script, then symlink the wrapper
# into the binaries directory.
install: installdocs
	install -d $(DESTDIR)$(TWEEPER_DIR)
	install -m644 *.php $(DESTDIR)$(TWEEPER_DIR)
	install -m755 tweeper $(DESTDIR)$(TWEEPER_DIR)
	install -d $(DESTDIR)$(TWEEPER_DIR)/src
	install -m644 src/* $(DESTDIR)$(TWEEPER_DIR)/src
	install -d $(DESTDIR)$(BIN_DIR)
	ln -rsf $(DESTDIR)$(TWEEPER_DIR)/tweeper $(DESTDIR)$(BIN_DIR)/tweeper
	@# printf instead of "echo -e": the latter is not portable, some /bin/sh
	@# implementations print the "-e" literally.
	@printf '\n\nINSTALLATION COMPLETE\n'
	@printf "Make sure '%s' is in PHP include_path!\n\n" '$(DESTDIR)$(PHP_SCRIPT_DIR)'
tweeper-1.4.3/NEWS 0000664 0000000 0000000 00000012346 13772136612 0013707 0 ustar 00root root 0000000 0000000 News for v1.4.3:
================
* Fix scraping twitter.com again by impersonating a Google crawler
* Add check for http response code and return failure for error codes
* Return failure when instagram.com redirects to login page
News for v1.4.2:
================
* Add option to enable or disable showing verbose output
* Add back partial support for twitter.com using the old twitter mobile UI
* Misc fixes to code and documentation
News for v1.4.1:
================
* Enable cookie handling in cURL to fix scraping twitter.com
* Update User-Agent version to fix scraping hashtag pages on twitter.com
News for v1.4.0:
================
* Make the images adapt to the screen width in feed readers which render the
HTML data in the description.
* Indicate if there is a GIF image in a tweet.
* Add option to enable or disable showing usernames in RSS items.
* Retry multiple times to retrieve a resource before giving up.
* Fix coding style.
* Add option to enable or disable showing multimedia content in RSS items.
* Fix generating enclosures for Dilbert.com
* Make enclosure elements validate with feedvalidator.org when the server
does not provide a Content-Length header.
News for v1.3.0:
================
* Fix scraping instagram.com
* Fix scraping twitter.com
* Improve scraping twitter.com hashtag pages, like for example
https://twitter.com/hashtag/tweeper
* Fix getting the channel logo URL for identi.ca/pump.io
* Add support for scraping Instagram hashtag pages, like for example
https://www.instagram.com/explore/tags/marechiaro
* Make the RSS feed for twitter.com hashtag pages validate with
feedvalidator.org
News for v1.2.0:
================
* Add support for scraping Instagram location pages, like for example
https://www.instagram.com/explore/locations/833277432/
* Make scraping Instagram.com more robust
* Improve and fix scraping Facebook.com pages once again
* Add support for Twitter.com permalink URLs
* Make the generated Twitter.com feed match more closely the original
content, now spaces and line wraps are preserved in feed readers which can
render the HTML code embedded in the <description> element, this way
ASCII art tweets can be fully appreciated when read via tweeper.
Check out https://twitter.com/sarahjeong/status/955651919279722496
News for v1.1.0:
================
* Make scraping Facebook.com pages more robust
* Fix getting the channel image for Facebook.com pages
* Add some development tools
* Fix a problem with some feed readers when showing images from Twitter.com
by ignoring the "style" attribute in the scraped HTML
* Filter out promoted tweets when scraping Twitter.com
* Remove support for Howtoons.com, the old blog is not available anymore
News for v1.0.0:
================
* Support "application/octet-stream" as an enclosure content type
* Support "application/pdf" as an enclosure content type
* Fix information leakage by validating the URL scheme
* Code restructuring to make it easier to use tweeper as a library in other
projects
* Allow installing tweeper via composer, the packagist page is at:
https://packagist.org/packages/ao2/tweeper
* Misc robustness fixes
News for v0.6:
==============
* Fix support for Facebook.com public pages
* Fix support for Dilbert.com
* Major code cleanup (coding style, functions naming)
* Fix indentation when generating the element
* Support generating enclosure for "image/png" links
* Major improvements for Twitter.com:
- embed images directly in the item description, linking to the original
versions uploaded by the user;
- use direct links instead of ones pointing to the t.co redirector;
- show explicitly if the attached media is a video;
- add enclosure element support for attached images.
* Minor improvements for Instagram.com:
- fix the channel link;
- make images adapt to the feed reader view, this avoids horizontal
scrolling if the image is too big.
* Support generating enclosure for images on Dilbert.com
* Support generating enclosure for images on Pump.io sites
* Misc fixes to code and documentation
News for v0.5:
==============
* Use the Symfony Serializer component instead of the PEAR XML_Serializer
* Make the Twitter stylesheet stricter to avoid empty timeline entries
News for v0.4:
==============
* Make the generated RSS validate with feedvalidator.org
* Fix support for Dilbert.com
* Add support for Instagram.com
* Add support for public pages on Facebook.com
* Make tweeper work with the PHP built-in web server
* Misc fixes to code and documentation
News for v0.3:
==============
* Support generating enclosure for "audio/ogg" links
* Always specify xml:base to improve local URLs expansions in some cases
* Support both the classic and the new Twitter profile pages
* Fix getting the profile picture of Twitter users
* Add support for Howtoons.com
News for v0.2:
==============
* Small fixes to the man page which must be in the coming Debian package
News for v0.1:
==============
* Initial version supporting Twitter.com
* Add support for pump.io sites
* Add support for dilbert.com
* Show links to supported media files in the RSS element
tweeper-1.4.3/README 0000664 0000000 0000000 00000004213 13772136612 0014062 0 ustar 00root root 0000000 0000000 Tweeper is a web scraper which can be used to conveniently follow the public
activity of social network users without the need to log in or even be
subscribed to the social network; tweeper converts the public information to
RSS so that it can be accessed and collected by a feed reader.
Since Jun 11th 2013, when Twitter.com retired their API v1.0, it has not been
possible anymore to access a user timeline via RSS, and it has also become
mandatory to authenticate via OAuth to access this _public_ information in the
JSON format:
https://dev.twitter.com/discussions/16289
https://dev.twitter.com/discussions/11564
Some services came up to overcome this "problem":
http://twss.55uk.net/
http://twitter-rss.com/ (now redirecting to google.com)
http://rssitfor.me
However these solutions are still shady and leave the user no control over
who collects the information about the visited user timelines.
This is why Tweeper[1] was born, as an Open Source way to keep following your
friends with a certain degree of anonymity, without having to tell Twitter.com
whom you are friends with.
[1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173
Tweeper can easily scrape sites other than Twitter, it is just a matter of
writing an xsl stylesheet for the transformation; an example for pump.io
activity stream is provided in rss_converter_pump.io.xsl
The currently supported sites are:
* Twitter.com
* Pump.io based websites, like Identi.ca
* Dilbert.com
* Instagram.com
* Facebook.com (public pages)
Tweeper can be used via web or as a command line program, for example as
a filter in your feed reader, by passing the URL of the user's public timeline
as the first argument.
Example of use on the command line:
$ php tweeper.php https://twitter.com/NSACareers
Example of use as a Liferea[2] filter:
$ liferea-add-feed "|php .../path_to_tweeper/tweeper.php https://twitter.com/NSAcareers"
Example of use with identi.ca:
$ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://identi.ca/evan"
[2] http://lzone.de/liferea/
Tweeper is licensed under the GPLv3 or later.
Tweeper was written by Antonio Ospite https://ao2.it
tweeper-1.4.3/TODO 0000664 0000000 0000000 00000000776 13772136612 0013704 0 ustar 00root root 0000000 0000000 - write better XSL stylesheets? I am not an XSL expert
- evaluate the use of the RSS element
- show cards directly in RSS items for twitter.com
- show direct links for videos in the Instagram feed
- check the encoding of the tweets when UTF is used,
maybe solvable with mb_convert_encoding()?
See http://php.net/manual/en/domdocument.loadhtml.php
- The dependencies on the Symfony components in composer.json could be more
relaxed like ">=2.7.0", but for now sticking to "2.7.*" is good enough.
tweeper-1.4.3/autoload.php 0000664 0000000 0000000 00000005474 13772136612 0015535 0 ustar 00root root 0000000 0000000
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
// Pick an autoloading strategy depending on how tweeper has been installed:
// a local composer install, a dependency inside another composer project, or
// a plain system-wide installation without composer.
$package_name = 'ao2/tweeper';

if (file_exists(__DIR__ . '/vendor/autoload.php')) {
  /*
   * If "composer install" has been executed, use the composer autoloader.
   *
   * Using __DIR__ is OK as long as this file is on the same level of the
   * project "vendor/" directory (usually the project root directory).
   */
  require __DIR__ . '/vendor/autoload.php';
}
elseif (preg_match('/' . preg_quote('/vendor/' . $package_name, '/') . '$/', __DIR__)) {
  /*
   * If running from a "vendor/" directory of another project use the
   * autoloader of the parent project.
   *
   * This covers the case of running from a symlink in ./vendor/bin/ because
   * __DIR__ contains the *real path* of this file.
   *
   * Note that using __DIR__ here and going back two levels is OK under the
   * assumptions that this file is in the project root directory, and that the
   * package name has the structure VENDOR/PROJECT_NAME.
   */
  require __DIR__ . '/../../autoload.php';
}
else {
  /*
   * Otherwise, run without composer:
   *
   * 1. register our own autoloader function for the Tweeper class
   *
   * The implementation follows the one suggested in:
   * http://www.php-fig.org/psr/psr-4/
   */
  spl_autoload_register(function ($fully_qualified_class_name) {
    /* This matches the data defined for the PSR-4 autoloader in composer.json */
    $namespace_prefix = 'Tweeper\\';

    // Anchor the lookup to the directory of this file, so that it does not
    // depend on the include_path or on the current working directory.
    $base_directory = __DIR__ . '/src/';

    // Classes from other namespaces are left to the other autoloaders.
    $len = strlen($namespace_prefix);
    if (strncmp($namespace_prefix, $fully_qualified_class_name, $len) !== 0) {
      return;
    }

    $class_relative = substr($fully_qualified_class_name, $len);
    $file_path = $base_directory . str_replace('\\', '/', $class_relative) . '.php';

    // An autoloader must not fail fatally when the class file is missing:
    // just return and let PHP report the unknown class to the caller.
    if (is_file($file_path)) {
      require_once $file_path;
    }
  });

  /*
   * 2. load the system-wide autoloader from php-symfony-serializer
   *
   * This allows to run tweeper without composer, as long as the Symfony
   * dependencies are available system-wide.
   *
   * For example, the Debian package takes care of that.
   */
  require_once 'Symfony/Component/Serializer/autoload.php';
}
tweeper-1.4.3/composer.json 0000664 0000000 0000000 00000001514 13772136612 0015725 0 ustar 00root root 0000000 0000000 {
"name": "ao2/tweeper",
"type": "library",
"description": "Tweeper is a web scraper to convert popular social media sites to RSS (e.g. Twitter.com, Instagram.com).",
"keywords": ["Twitter", "Instagram", "Facebook", "RSS", "scraper"],
"homepage": "https://git.ao2.it/tweeper.git",
"license": "GPL-3.0+",
"authors": [
{
"name": "Antonio Ospite",
"email": "ao2@ao2.it",
"homepage": "https://ao2.it",
"role": "Developer"
}
],
"require": {
"php": ">=5.3.0",
"ext-curl": "*",
"ext-dom": "*",
"ext-json": "*",
"ext-xsl": "*",
"symfony/serializer": ">=2.7.0",
"symfony/property-access": ">=2.7.0"
},
"autoload": {
"psr-4": { "Tweeper\\": "src/" }
},
"bin": ["tweeper"]
}
tweeper-1.4.3/src/ 0000775 0000000 0000000 00000000000 13772136612 0013771 5 ustar 00root root 0000000 0000000 tweeper-1.4.3/src/Tweeper.php 0000664 0000000 0000000 00000033127 13772136612 0016123 0 ustar 00root root 0000000 0000000
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
use DOMDocument;
use XSLTProcessor;
use Symfony\Component\Serializer\Serializer;
use Symfony\Component\Serializer\Encoder\XmlEncoder;
use Symfony\Component\Serializer\Normalizer\ObjectNormalizer;
date_default_timezone_set('UTC');
/**
 * Scrape supported websites and perform conversion to RSS.
 *
 * The entry point is tweep(): it downloads a page, converts it to an XML
 * document (optionally through host-specific "preprocessHtml<Host>" and
 * "getXml<Host>" methods) and transforms it with the XSL stylesheet named
 * after the host.
 */
class Tweeper {

  /**
   * User-Agent string sent with every request unless the caller overrides it.
   *
   * @var string
   */
  private static $userAgent = "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)";

  /**
   * Value used for CURLOPT_CONNECTTIMEOUT, in seconds.
   *
   * @var int
   */
  private static $maxConnectionTimeout = 5;

  /**
   * How many times a transfer is attempted when it keeps timing out.
   *
   * @var int
   */
  private static $maxConnectionRetries = 5;

  /**
   * Passed to the stylesheets as the "generate-enclosure" parameter.
   *
   * Declared explicitly (like the three properties below) because creating
   * properties dynamically is deprecated in recent PHP versions; they are
   * public to keep the visibility they had as dynamic properties.
   *
   * @var bool
   */
  public $generate_enclosure;

  /**
   * Passed to the stylesheets as the "show-usernames" parameter.
   *
   * @var bool
   */
  public $show_usernames;

  /**
   * Passed to the stylesheets as the "show-multimedia" parameter.
   *
   * @var bool
   */
  public $show_multimedia;

  /**
   * Whether the XML parsing errors are reported via trigger_error().
   *
   * @var bool
   */
  public $verbose_output;

  /**
   * Create a new Tweeper object controlling optional settings.
   *
   * @param bool $generate_enclosure
   *   Enables the creation of enclosure elements (disabled by default).
   * @param bool $show_usernames
   *   Enables showing the username in front of the content for multi-user
   *   sites (enabled by default). Only some stylesheets support this
   *   functionality (twitter, instagram, pump.io).
   * @param bool $show_multimedia
   *   Enables showing multimedia content (images, videos) directly in the
   *   item description (enabled by default). Only some stylesheets support
   *   this functionality (twitter, instagram, dilbert).
   * @param bool $verbose_output
   *   Enables showing non-fatal errors like XML parsing errors.
   */
  public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE, $show_multimedia = TRUE, $verbose_output = TRUE) {
    $this->generate_enclosure = $generate_enclosure;
    $this->show_usernames = $show_usernames;
    $this->show_multimedia = $show_multimedia;
    $this->verbose_output = $verbose_output;
  }

  /**
   * Convert numeric Epoch to the date format expected in a RSS document.
   *
   * @param mixed $timestamp
   *   Seconds since the Epoch. Anything which is not a finite number is
   *   treated as 0.
   *
   * @return string
   *   The date formatted according to DATE_RSS, in UTC.
   */
  public static function epochToRssDate($timestamp) {
    // is_finite() rejects NAN as well as +/-INF, none of which can be
    // converted to the integer expected by gmdate().
    if (!is_numeric($timestamp) || !is_finite((float) $timestamp)) {
      $timestamp = 0;
    }

    // Truncate explicitly: gmdate() wants an integer and complains about
    // fractional values on recent PHP versions.
    return gmdate(DATE_RSS, (int) $timestamp);
  }

  /**
   * Convert generic date string to the date format expected in a RSS document.
   *
   * @param string $date
   *   A date in any format understood by strtotime().
   *
   * @return string
   *   The date formatted according to DATE_RSS; the Epoch when $date cannot
   *   be parsed.
   */
  public static function strToRssDate($date) {
    $timestamp = strtotime($date);
    if (FALSE === $timestamp) {
      $timestamp = 0;
    }

    return self::epochToRssDate($timestamp);
  }

  /**
   * Convert string to UpperCamelCase.
   *
   * @param string $str
   *   The string to convert.
   * @param string $delim
   *   The delimiter separating the words in $str; it is removed from the
   *   result.
   *
   * @return string
   *   The string with the first character of each word in upper case and
   *   the delimiters removed, e.g. "twitter.com" becomes "TwitterCom" when
   *   $delim is ".".
   */
  public static function toUpperCamelCase($str, $delim = ' ') {
    $str_upper = ucwords($str, $delim);
    $str_camel_case = str_replace($delim, '', $str_upper);
    return $str_camel_case;
  }

  /**
   * Perform a cURL session multiple times when it fails with a timeout.
   *
   * @param resource $ch
   *   a cURL session handle.
   *
   * @return mixed
   *   The result of curl_exec(), or FALSE when the transfer failed or the
   *   response code is 400 or greater.
   */
  private static function curlExec($ch) {
    $ret = FALSE;
    $attempt = 0;

    // Only timeouts are retried, any other error ends the loop immediately.
    do {
      $ret = curl_exec($ch);
      if (FALSE === $ret) {
        trigger_error(curl_error($ch), E_USER_WARNING);
      }
    } while (curl_errno($ch) == CURLE_OPERATION_TIMEDOUT && ++$attempt < self::$maxConnectionRetries);

    $response_code = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    if (FALSE === $response_code) {
      trigger_error(curl_error($ch), E_USER_WARNING);
      return FALSE;
    }

    if ($response_code >= 400) {
      trigger_error("HTTP response code $response_code", E_USER_WARNING);
      return FALSE;
    }

    return $ret;
  }

  /**
   * Get the contents from a URL.
   *
   * @param string $url
   *   The URL to retrieve.
   * @param string|null $user_agent
   *   The User-Agent to use instead of the default one.
   *
   * @return string|false
   *   The response body, or FALSE on failure.
   */
  private static function getUrlContents($url, $user_agent = NULL) {
    $ch = curl_init($url);
    curl_setopt_array($ch, [
      CURLOPT_HEADER => FALSE,
      CURLOPT_CONNECTTIMEOUT => self::$maxConnectionTimeout,
      // Follow http redirects to get the real URL.
      CURLOPT_FOLLOWLOCATION => TRUE,
      // An empty file name enables cookie handling without reading any file.
      CURLOPT_COOKIEFILE => "",
      CURLOPT_RETURNTRANSFER => TRUE,
      CURLOPT_HTTPHEADER => ['Accept-language: en'],
      CURLOPT_USERAGENT => isset($user_agent) ? $user_agent : self::$userAgent,
    ]);
    $contents = self::curlExec($ch);
    curl_close($ch);

    return $contents;
  }

  /**
   * Get the headers from a URL.
   *
   * @param string $url
   *   The URL to inspect.
   * @param string|null $user_agent
   *   The User-Agent to use instead of the default one.
   *
   * @return array|false
   *   The array returned by curl_getinfo() for a body-less request to the
   *   URL, or FALSE on failure.
   */
  private static function getUrlInfo($url, $user_agent = NULL) {
    $ch = curl_init($url);
    curl_setopt_array($ch, [
      CURLOPT_HEADER => TRUE,
      CURLOPT_NOBODY => TRUE,
      CURLOPT_CONNECTTIMEOUT => self::$maxConnectionTimeout,
      // Follow http redirects to get the real URL.
      CURLOPT_FOLLOWLOCATION => TRUE,
      CURLOPT_RETURNTRANSFER => TRUE,
      CURLOPT_USERAGENT => isset($user_agent) ? $user_agent : self::$userAgent,
    ]);

    $ret = self::curlExec($ch);
    if (FALSE === $ret) {
      curl_close($ch);
      return FALSE;
    }

    $url_info = curl_getinfo($ch);
    if (FALSE === $url_info) {
      trigger_error(curl_error($ch), E_USER_WARNING);
    }
    curl_close($ch);

    return $url_info;
  }

  /**
   * Generate an RSS enclosure element.
   *
   * @param string $url
   *   The URL of the resource to reference from the enclosure.
   *
   * @return \DOMElement|string
   *   The "enclosure" element, or an empty string when the URL cannot be
   *   inspected or its content type is not supported.
   */
  public static function generateEnclosure($url) {
    $supported_content_types = [
      "application/octet-stream",
      "application/ogg",
      "application/pdf",
      "audio/aac",
      "audio/mp4",
      "audio/mpeg",
      "audio/ogg",
      "audio/vorbis",
      "audio/wav",
      "audio/webm",
      "audio/x-midi",
      "image/gif",
      "image/jpeg",
      "image/png",
      "video/avi",
      "video/mp4",
      "video/mpeg",
      "video/ogg",
    ];

    $url_info = self::getUrlInfo($url);
    if (FALSE === $url_info) {
      trigger_error("Failed to retrieve info for URL: " . $url, E_USER_WARNING);
      return '';
    }

    $supported = in_array($url_info['content_type'], $supported_content_types);
    if (!$supported) {
      trigger_error("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url'], E_USER_WARNING);
      return '';
    }

    // The RSS specification says that the enclosure element URL must be http.
    // See http://sourceforge.net/p/feedvalidator/bugs/72/
    $http_url = preg_replace("/^https/", "http", $url_info['url']);

    // When the server does not provide a Content-Length header,
    // curl_getinfo() would return a negative value for
    // "download_content_length", however RSS recommends to use 0 when the
    // enclosure's size cannot be determined.
    // See: https://www.feedvalidator.org/docs/error/UseZeroForUnknown.html
    //
    // The integer cast makes sure the attribute never ends up in the
    // exponential notation PHP uses when printing large floats.
    $length = (int) max($url_info['download_content_length'], 0);

    $dom = new DOMDocument();
    $enc = $dom->createElement('enclosure');
    $enc->setAttribute('url', $http_url);
    $enc->setAttribute('length', (string) $length);
    $enc->setAttribute('type', $url_info['content_type']);

    return $enc;
  }

  /**
   * Mimic the message from libxml.c::php_libxml_ctx_error_level()
   *
   * @param object $error
   *   An error object as returned by libxml_get_errors().
   */
  private static function logXmlError($error) {
    $output = "";

    switch ($error->level) {
      case LIBXML_ERR_WARNING:
        $output .= "Warning $error->code: ";
        break;

      case LIBXML_ERR_ERROR:
        $output .= "Error $error->code: ";
        break;

      case LIBXML_ERR_FATAL:
        $output .= "Fatal Error $error->code: ";
        break;
    }

    $output .= trim($error->message);

    if ($error->file) {
      $output .= " in $error->file";
    }
    else {
      $output .= " in Entity,";
    }

    $output .= " line $error->line";

    trigger_error($output, E_USER_WARNING);
  }

  /**
   * Convert json to XML.
   *
   * @param string $json
   *   The json data to convert.
   * @param string $root_node_name
   *   The name of the root element of the generated XML.
   *
   * @return string|null
   *   The XML data, or NULL when the json decodes to an empty value or
   *   cannot be serialized.
   */
  private static function jsonToXml($json, $root_node_name) {
    // Apparently the ObjectNormalizer used afterwards is not able to handle
    // the stdClass object created by json_decode() with the default setting
    // $assoc = false; so decode to associative arrays.
    $data = json_decode($json, TRUE);
    if (!$data) {
      return NULL;
    }

    $encoder = new XmlEncoder();
    $normalizer = new ObjectNormalizer();
    $serializer = new Serializer([$normalizer], [$encoder]);

    $serializer_options = [
      'xml_encoding' => "UTF-8",
      'xml_format_output' => TRUE,
      'xml_root_node_name' => $root_node_name,
    ];

    $xml_data = $serializer->serialize($data, 'xml', $serializer_options);
    if (!$xml_data) {
      trigger_error("Cannot serialize data", E_USER_WARNING);
      return NULL;
    }

    return $xml_data;
  }

  /**
   * Convert the Instagram content to XML.
   *
   * @param string $html
   *   The HTML of an Instagram page.
   *
   * @return string|null
   *   The XML version of the json data embedded in the page, or NULL when
   *   the data cannot be found or decoded, or when the page is a login page.
   */
  private function getXmlInstagramCom($html) {
    // Extract the json data from the html code.
    $json_match_expr = '/window._sharedData = (.*);/';
    $ret = preg_match($json_match_expr, $html, $matches);
    if ($ret !== 1) {
      trigger_error("Cannot match expression: $json_match_expr\n", E_USER_WARNING);
      return NULL;
    }

    // Bail out early when the matched text is not a json object/array, the
    // array functions used below cannot deal with anything else.
    $data = json_decode($matches[1], TRUE);
    if (!is_array($data)) {
      trigger_error("Cannot decode the instagram json data.\n", E_USER_WARNING);
      return NULL;
    }

    // The "qe" object contains elements which will result in invalid XML
    // element names, so remove it.
    unset($data["qe"]);

    // The "knobs" object contains elements with undefined namespaces, so
    // remove it to silence an error message.
    unset($data["knobs"]);

    // Stop here in case Instagram redirected to the login page, this can
    // happen when too many consecutive requests have been made from the same
    // IP.
    if (isset($data["entry_data"]) && is_array($data["entry_data"]) && array_key_exists("LoginAndSignupPage", $data["entry_data"])) {
      trigger_error("Cannot open instagram page: redirected to Login page.\n", E_USER_WARNING);
      return NULL;
    }

    $json = json_encode($data);

    return self::jsonToXml($json, 'instagram');
  }

  /**
   * Make the Facebook HTML processable.
   *
   * @param string $html
   *   The HTML of a Facebook page.
   *
   * @return string
   *   The HTML with the comment delimiters removed.
   */
  private function preprocessHtmlFacebookCom($html) {
    // Drop the HTML comment delimiters, so that any markup wrapped in
    // comments becomes part of the document handed over to the parser.
    // NOTE(review): this assumes the interesting content of Facebook pages
    // is shipped inside HTML comments -- confirm against a live page.
    return str_replace(['<!--', '-->'], '', $html);
  }

  /**
   * Convert the HTML retrieved from the site to XML.
   *
   * @param string $html
   *   The (possibly preprocessed) HTML of the page.
   * @param string $host
   *   The host name, used to look for a "getXml<Host>" method.
   *
   * @return \DOMDocument|null
   *   The document loaded from the page, or NULL when the host-specific
   *   conversion failed.
   */
  private function htmlToXml($html, $host) {
    $xmlDoc = new DOMDocument();

    // Handle warnings and errors when loading invalid HTML.
    $xml_errors_value = libxml_use_internal_errors(TRUE);

    // If there is a host-specific method to get the XML data, use it!
    $get_xml_host_method = 'getXml' . self::toUpperCamelCase($host, '.');
    if (method_exists($this, $get_xml_host_method)) {
      $xml_data = call_user_func_array([$this, $get_xml_host_method], [$html]);
      if (NULL === $xml_data) {
        // Put the libxml error handling back the way it was found, the
        // setting is global and must not leak to the caller.
        libxml_clear_errors();
        libxml_use_internal_errors($xml_errors_value);
        return NULL;
      }
      $xmlDoc->loadXML($xml_data);
    }
    else {
      $xmlDoc->loadHTML($html);
    }

    if ($this->verbose_output) {
      foreach (libxml_get_errors() as $xml_error) {
        self::logXmlError($xml_error);
      }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($xml_errors_value);

    return $xmlDoc;
  }

  /**
   * Load a stylesheet if the web site is supported.
   *
   * @param string $host
   *   The host name; the stylesheet is looked up as
   *   "rss_converter_<host>.xsl" in the directory of this file.
   *
   * @return \XSLTProcessor|null
   *   A processor set up with the stylesheet and the parameters derived
   *   from the object settings, or NULL when the stylesheet cannot be used.
   */
  private function loadStylesheet($host) {
    $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
    if (FALSE === file_exists($stylesheet)) {
      trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_WARNING);
      return NULL;
    }

    $stylesheet_contents = file_get_contents($stylesheet);
    if (FALSE === $stylesheet_contents) {
      trigger_error("Cannot open $stylesheet", E_USER_WARNING);
      return NULL;
    }

    $xslDoc = new DOMDocument();
    if (FALSE === $xslDoc->loadXML($stylesheet_contents)) {
      trigger_error("Cannot parse $stylesheet", E_USER_WARNING);
      return NULL;
    }

    $xsltProcessor = new XSLTProcessor();
    // Let the stylesheets call PHP functions.
    $xsltProcessor->registerPHPFunctions();
    $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure);
    $xsltProcessor->setParameter('', 'show-usernames', $this->show_usernames);
    $xsltProcessor->setParameter('', 'show-multimedia', $this->show_multimedia);
    if (FALSE === $xsltProcessor->importStylesheet($xslDoc)) {
      trigger_error("Cannot import $stylesheet", E_USER_WARNING);
      return NULL;
    }

    return $xsltProcessor;
  }

  /**
   * Convert the site content to RSS.
   *
   * @param string $src_url
   *   The URL of the page to convert.
   * @param string|null $host
   *   The host name used to select the stylesheet and the host-specific
   *   methods; derived from $src_url when NULL.
   * @param bool $validate_scheme
   *   When TRUE only http and https URLs are accepted.
   *
   * @return string|null
   *   The RSS document, or NULL on failure; failures are reported with
   *   trigger_error().
   */
  public function tweep($src_url, $host = NULL, $validate_scheme = TRUE) {
    $url = parse_url($src_url);
    if (FALSE === $url) {
      trigger_error("Invalid URL: $src_url", E_USER_WARNING);
      return NULL;
    }

    if (TRUE === $validate_scheme) {
      // URLs like "twitter.com/user" have no scheme at all.
      if (!isset($url["scheme"])) {
        trigger_error("Missing scheme in URL: $src_url", E_USER_WARNING);
        return NULL;
      }

      // URL schemes are case-insensitive.
      $scheme = $url["scheme"];
      if (!in_array(strtolower($scheme), ["http", "https"])) {
        trigger_error("unsupported scheme: $scheme", E_USER_WARNING);
        return NULL;
      }
    }

    // If the host is not given derive it from the URL.
    if (NULL === $host) {
      if (empty($url["host"])) {
        trigger_error("Invalid host in URL: $src_url", E_USER_WARNING);
        return NULL;
      }
      // Strip the leading www. to be more forgiving on input URLs.
      $host = preg_replace('/^www\./', '', $url["host"]);
    }

    // Check for the stylesheet before hitting the network, to fail early on
    // unsupported sites.
    $xsltProcessor = $this->loadStylesheet($host);
    if (NULL === $xsltProcessor) {
      return NULL;
    }

    $html = self::getUrlContents($src_url);
    if (FALSE === $html) {
      trigger_error("Failed to retrieve $src_url", E_USER_WARNING);
      return NULL;
    }

    $preprocess_html_host_method = 'preprocessHtml' . self::toUpperCamelCase($host, '.');
    if (method_exists($this, $preprocess_html_host_method)) {
      $html = call_user_func_array([$this, $preprocess_html_host_method], [$html]);
    }

    $xmlDoc = $this->htmlToXml($html, $host);
    if (NULL === $xmlDoc) {
      return NULL;
    }

    $output = $xsltProcessor->transformToXML($xmlDoc);
    if (FALSE === $output) {
      trigger_error('XSL transformation failed.', E_USER_WARNING);
      return NULL;
    }

    return $output;
  }

}
tweeper-1.4.3/src/rss_converter_dilbert.com.xsl 0000664 0000000 0000000 00000012431 13772136612 0021702 0 ustar 00root root 0000000 0000000
...<![CDATA[]]>Tweeper
tweeper-1.4.3/src/rss_converter_facebook.com.xsl 0000664 0000000 0000000 00000013426 13772136612 0022033 0 ustar 00root root 0000000 0000000
https://facebook.com...<![CDATA[]]>Tweeper<![CDATA[]]>
tweeper-1.4.3/src/rss_converter_identi.ca.xsl 0000777 0000000 0000000 00000000000 13772136612 0026425 2rss_converter_pump.io.xsl ustar 00root root 0000000 0000000 tweeper-1.4.3/src/rss_converter_instagram.com.xsl 0000664 0000000 0000000 00000020721 13772136612 0022243 0 ustar 00root root 0000000 0000000
https://instagram.com...<![CDATA[
(Video)
]]>Tweeper<![CDATA[]]>
tweeper-1.4.3/src/rss_converter_pump.io.xsl 0000664 0000000 0000000 00000011370 13772136612 0021070 0 ustar 00root root 0000000 0000000
<![CDATA[]]>Tweeper
tweeper-1.4.3/src/rss_converter_twitter.com.xsl 0000664 0000000 0000000 00000025351 13772136612 0021764 0 ustar 00root root 0000000 0000000
https://twitter.com(Video) <![CDATA[ (Video) (GIF)white-space: pre-wrap;]]>Tweeper
tweeper-1.4.3/tests/ 0000775 0000000 0000000 00000000000 13772136612 0014344 5 ustar 00root root 0000000 0000000 tweeper-1.4.3/tests/fetch_facebook_page.sh 0000775 0000000 0000000 00000001046 13772136612 0020622 0 ustar 00root root 0000000 0000000 #!/bin/sh
#
# Facebook requires a CAPTCHA most of the time, so keep fetching the URL as
# long as needed, until the page is shown with no CAPTCHA.
#
# The URL to fetch is the first argument, the page is saved to facebook.html
# in the current directory.

set -e

if [ $# -lt 1 ];
then
  echo "usage: $0 <url>" >&2
  exit 1
fi

USER_AGENT="Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";

while true;
do
  # Force language to en-us to make sure that the string matching works
  OUTPUT=$(wget -nv --user-agent="$USER_AGENT" --header='Accept-Language: en-us' -O - -- "$1")

  # Look for the CAPTCHA marker anywhere in the page. The variable is quoted
  # so that the HTML is not subject to word splitting and pathname expansion,
  # and printf is used because echo may interpret backslashes.
  if ! printf '%s\n' "$OUTPUT" | grep -q "Security Check Required";
  then
    printf '%s\n' "$OUTPUT" > facebook.html
    break
  fi

  sleep 5
done
tweeper-1.4.3/tests/instument_to_catch_promoted_tweets.diff 0000664 0000000 0000000 00000001264 13772136612 0024377 0 ustar 00root root 0000000 0000000 diff --git a/src/Tweeper.php b/src/Tweeper.php
index 8ac2fe3..c45aab5 100644
--- a/src/Tweeper.php
+++ b/src/Tweeper.php
@@ -355,6 +355,15 @@ class Tweeper {
$html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
}
+ // XXX REMOVE: instrumentation to catch promoted tweets
+ if ($host == "twitter.com") {
+ $twitter_promoted_match_expr = '/promoted/i';
+ $ret = preg_match($twitter_promoted_match_expr, $html, $matches);
+ if ($ret) {
+ file_put_contents("/home/ao2/TWITTER_PROMOTED_DUMP.html", $html);
+ }
+ }
+
$xmlDoc = $this->htmlToXml($html, $host);
if (NULL === $xmlDoc) {
return NULL;
tweeper-1.4.3/tests/test_information_leakage.sh 0000775 0000000 0000000 00000002020 13772136612 0021732 0 ustar 00root root 0000000 0000000 #!/bin/sh
# Check that tweeper cannot be used to find out whether a local file exists,
# both when run from the command line and when run via a web server.

set -e

TWEEPER="/usr/share/php/tweeper/tweeper"
#TWEEPER="./tweeper"

# Print the outcome of a check.
#
# $1: the URL passed to tweeper
# $2: a description of the file which has been probed
# $3: the output of tweeper, the file is reported as existing when non-empty
check_result() {
  URL="$1"
  FILE="$2"
  RESULT="$3"

  echo "URL $URL"
  if [ -n "$RESULT" ];
  then
    echo "--> $FILE"
    echo "    exists"
  else
    echo "... $FILE"
    echo "    does not exist"
  fi
  echo
}

# Probe a local file by running tweeper from the command line.
#
# $1: the absolute path of the file
file_exists() {
  FILE="$1"
  URL="file://twitter.com/$FILE"
  OUTPUT=$("$TWEEPER" "$URL")
  check_result "$URL" "$FILE" "$OUTPUT"
}

# Probe a file on the server by calling tweeper.php via http.
#
# $1: the base URL of the server
# $2: the absolute path of the file
file_exists_on_server() {
  SERVER="$1"
  FILE="$2"
  URL="file://twitter.com/$FILE"
  # Quote the request URL: it contains a "?" which the shell would otherwise
  # treat as a pathname expansion pattern.
  OUTPUT=$(curl "$SERVER/tweeper.php?src_url=$URL" 2> /dev/null)
  check_result "$URL" "$FILE on $SERVER" "$OUTPUT"
}

file_exists /etc/passwd || true
file_exists /etc/file_with_an_unlikely_name || true

echo "Starting a test server"
echo
php -S localhost:8000 -t "$(dirname "$TWEEPER")" > /dev/null 2>&1 &
SERVER_PID=$!

# Make sure the server does not outlive the script when it is interrupted.
trap 'kill "$SERVER_PID" 2> /dev/null || true' INT TERM

# Give the server the time to start listening.
sleep 1

file_exists_on_server http://localhost:8000 /etc/passwd || true
file_exists_on_server http://localhost:8000 /etc/file_with_an_unlikely_name || true

echo "Shutting down the test server"
kill "$SERVER_PID"
tweeper-1.4.3/tests/tweeper_file 0000775 0000000 0000000 00000000647 13772136612 0016753 0 ustar 00root root 0000000 0000000 #!/usr/bin/env php
\n";
// Both the file and the host name are mandatory.
if ($argc < 3) {
  fwrite(STDERR, $usage);
  exit(1);
}

// realpath() returns false when the file cannot be resolved; stop here
// instead of passing a bare "file://" URL down to tweeper.
$file_path = realpath($argv[1]);
if (false === $file_path) {
  fwrite(STDERR, "Cannot find file: {$argv[1]}\n");
  exit(1);
}

$file_url = 'file://' . $file_path;
$host = $argv[2];

// Scheme validation is disabled to let tweeper load the local file; the host
// is passed explicitly because it cannot be derived from a file:// URL.
$tweeper = new Tweeper();
$output = $tweeper->tweep($file_url, $host, false);
if (is_null($output)) {
  exit(1);
}
echo $output;
tweeper-1.4.3/tweeper 0000775 0000000 0000000 00000001132 13772136612 0014600 0 ustar 00root root 0000000 0000000 #!/usr/bin/env php
element
*-m <0|1>*::
enable or disable showing multimedia content (e.g. Twitter or Instagram
pictures) directly inside the item description. Default is 1 (enable).
*-u <0|1>*::
enable or disable showing usernames in front of the item for hosts which
support it (Twitter.com/Instagram.com). Default is 1 (enable).
*-v <0|1>*::
enable or disable showing verbose output like, for instance, non-fatal
errors and warnings from the XML parser. Default is 1 (enable).
*-h, --help*::
show the help message
EXAMPLE OF USE
--------------
Getting the RSS feed of some Twitter user:
tweeper https://twitter.com/NSACareers
Using tweeper as a filter for the Liferea feed reader:
liferea-add-feed "|tweeper https://twitter.com/NSAcareers"
To use tweeper via web there are two options (the examples assume the
installation directory to be `/usr/share/php/tweeper/`):
1. Using the PHP built-in web server:
php -S localhost:8000 -t /usr/share/php/tweeper/
+
and then visit 'http://localhost:8000/tweeper.php' in the web browser.
2. Using a generic web server with the document root in '/var/www':
sudo ln -s /usr/share/php/tweeper/tweeper.php /var/www
xdg-open http://localhost/tweeper.php?src_url=http://twitter.com/NSAcareers
+
It is enough to create the symlink only the very first time tweeper is used
this way.
NOTES
-----
In order to use tweeper with a symlink with the apache 'userdir' module, the
'SymLinksIfOwnerMatch' option must be replaced by 'FollowSymLinks' in
/etc/apache2/mods-enabled/userdir.conf
EXIT STATUS
-----------
*0*::
Success
*!0*::
Failure
AUTHORS
-------
Antonio Ospite
RESOURCES
---------
Main web site:
COPYING
-------
Copyright \(C) 2013-2020 Antonio Ospite
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
tweeper-1.4.3/tweeper.php 0000664 0000000 0000000 00000010632 13772136612 0015370 0 ustar 00root root 0000000 0000000
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
require_once 'autoload.php';
use Tweeper\Tweeper;
// Use UTC as the default timezone for the date/time functions called while
// this script runs.
date_default_timezone_set('UTC');
/**
 * Tell whether the current PHP process was started from the command line.
 *
 * @return bool
 *   TRUE when the Server API name is exactly "cli", FALSE otherwise.
 */
function is_cli() {
  $sapi_name = PHP_SAPI;
  return 'cli' === $sapi_name;
}
/**
 * Build the usage message for the current execution environment.
 *
 * @param array|null $argv
 *   The command line arguments. Only element 0 (the script name) is read, and
 *   only when running from the command line; NULL is fine otherwise.
 *
 * @return string
 *   The text "usage: " followed by a newline-terminated command line synopsis
 *   when running from the command line, or by an HTML-escaped query string
 *   synopsis otherwise.
 */
function usage($argv) {
  if (is_cli()) {
    // The source URL is mandatory and must be the last argument.
    $usage = "{$argv[0]} [-e|-m <0|1>|-u <0|1>|-v <0|1>|-h|--help] <src_url>\n";
  }
  else {
    // The synopsis contains '<', '>' and '&', so it is escaped before being
    // sent to a browser.
    $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>&show_usernames=<0|1>&show_multimedia=<0|1>&verbose_output=<0|1>");
  }

  return "usage: $usage";
}
/**
 * Parse command line options.
 *
 * Recognized options:
 *   -e          enable enclosure generation (takes no argument);
 *   -m <0|1>    show multimedia content;
 *   -u <0|1>    show usernames;
 *   -v <0|1>    verbose output;
 *   -h, --help  print the usage message and exit with status 0.
 *
 * An invalid boolean argument, or an unexpected option, prints a message on
 * STDERR and terminates the script with status 1.
 *
 * @param array $argv
 *   The command line arguments, script name first.
 * @param int $argc
 *   The number of elements in $argv.
 *
 * @return array
 *   The options keyed by 'generate_enclosure', 'show_usernames',
 *   'show_multimedia' and 'verbose_output' (all booleans), plus 'src_url'
 *   when at least one argument was passed besides the script name.
 */
function parse_options_cli($argv, $argc) {
  $options = [
    'generate_enclosure' => FALSE,
    'show_usernames' => TRUE,
    'show_multimedia' => TRUE,
    'verbose_output' => TRUE,
  ];

  // Only the script name was passed: there are no options and no URL.
  if ($argc < 2) {
    return $options;
  }

  // Short options which take a boolean argument, and the setting they control.
  $boolean_options = [
    'm' => 'show_multimedia',
    'u' => 'show_usernames',
    'v' => 'verbose_output',
  ];

  $cli_options = getopt("em:u:v:h", ["help"]);
  if (!is_array($cli_options)) {
    // getopt() returns FALSE on failure; carry on as if no option was given.
    $cli_options = [];
  }

  foreach ($cli_options as $opt => $val) {
    if (isset($boolean_options[$opt])) {
      $ret = filter_var($val, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE);
      if (NULL === $ret) {
        fwrite(STDERR, "Invalid argument for the -$opt option.\n");
        fwrite(STDERR, usage($argv));
        exit(1);
      }

      // Store the validated boolean, not the raw string: accepted spellings
      // like "false", "off" or "no" are non-empty strings and would otherwise
      // evaluate as TRUE.
      $options[$boolean_options[$opt]] = $ret;
      continue;
    }

    switch ($opt) {
      case 'e':
        $options['generate_enclosure'] = TRUE;
        break;

      case 'h':
      case 'help':
        echo usage($argv);
        exit(0);

      default:
        fwrite(STDERR, usage($argv));
        exit(1);
    }
  }

  // For now assume that the URL is the last argument, in the future we could
  // switch to PHP >= 7.1 and use the $optind argument of getopt().
  // NOTE(review): when options are given without a URL, the last option (or
  // its argument) ends up being used as the URL.
  $options['src_url'] = array_pop($argv);

  return $options;
}
/**
 * Parse options passed from a query string.
 *
 * Reads 'src_url', 'generate_enclosure', 'show_usernames', 'show_multimedia'
 * and 'verbose_output' from $_GET.
 *
 * The boolean parameters are validated with FILTER_VALIDATE_BOOLEAN, like the
 * command line options are, so the result does not depend on the string to
 * number comparison rules of the PHP version in use. A parameter which is
 * missing, or which is not a valid boolean, leaves the default untouched.
 *
 * @return array
 *   The options keyed by 'generate_enclosure', 'show_usernames',
 *   'show_multimedia' and 'verbose_output' (all booleans), plus 'src_url'
 *   when it was passed as a string.
 */
function parse_options_query_string() {
  $options = [
    'generate_enclosure' => FALSE,
    'show_usernames' => TRUE,
    'show_multimedia' => TRUE,
    'verbose_output' => TRUE,
  ];

  // List the boolean settings before 'src_url' is added to the options.
  $boolean_settings = array_keys($options);

  // PHP turns parameters like 'src_url[]=...' into arrays, only accept a
  // plain string as the URL.
  if (isset($_GET['src_url']) && is_string($_GET['src_url'])) {
    $options['src_url'] = $_GET['src_url'];
  }

  foreach ($boolean_settings as $name) {
    if (!isset($_GET[$name])) {
      continue;
    }

    $value = filter_var($_GET[$name], FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE);
    if (NULL !== $value) {
      $options[$name] = $value;
    }
  }

  return $options;
}
// Gather the options from the query string or from the command line, and
// choose where problems are reported: the regular output for web requests,
// the standard error for the command line.
if (!is_cli()) {
  $options = parse_options_query_string();
  $error_stream = fopen('php://output', 'w');
}
else {
  $options = parse_options_cli($argv, $argc);
  $error_stream = fopen('php://stderr', 'w');
}

// Without a source URL there is nothing to convert: show the usage and fail.
if (!isset($options['src_url'])) {
  fwrite($error_stream, is_cli() ? usage($argv) : usage(NULL));
  exit(1);
}

$tweeper = new Tweeper(
  $options['generate_enclosure'],
  $options['show_usernames'],
  $options['show_multimedia'],
  $options['verbose_output']
);

// A NULL result from tweep() is treated as a failure.
$output = $tweeper->tweep($options['src_url']);
if ($output === NULL) {
  exit(1);
}

print $output;