pax_global_header 0000666 0000000 0000000 00000000064 13244302333 0014507 g ustar 00root root 0000000 0000000 52 comment=c36da4f46b23dea2b762cdde9b0e9514251e29c5
tweeper-1.2.0/ 0000775 0000000 0000000 00000000000 13244302333 0013162 5 ustar 00root root 0000000 0000000 tweeper-1.2.0/HACKING 0000664 0000000 0000000 00000001206 13244302333 0014150 0 ustar 00root root 0000000 0000000 The code follows the Drupal coding standards:
https://www.drupal.org/coding-standards
Style compliance can be checked using the Coder Sniffer extension to the PEAR
PHP_CodeSniffer project, for instructions about how to install Coder Sniffer
see https://www.drupal.org/node/1419988
TL;DR: install drupal/coder and enable the Drupal coding standard in
PHP_CodeSniffer:
$ composer global require drupal/coder
$ export PATH="$HOME/.config/composer/vendor/bin:$PATH"
$ phpcs --config-set installed_paths $HOME/.config/composer/vendor/drupal/coder/coder_sniffer/
And then use this command to check the style:
$ phpcs --standard=Drupal .
tweeper-1.2.0/INSTALL 0000664 0000000 0000000 00000001107 13244302333 0014212 0 ustar 00root root 0000000 0000000 The recommended way to install tweeper globally is to install all its files
under /usr/share/php/tweeper and then make a symlink to the wrapper script
"tweeper" under /usr/bin
Tweeper depends on php-symfony-serializer which is used to convert json to xml
for some sites which provide the timeline data in json rather than in usable
html.
NOTE: Tweeper also depends indirectly on php-symfony-property-access because
the code relies on the ObjectNormalizer class which requires the
PropertyAccess component, see
http://symfony.com/doc/current/components/serializer.html#installation
tweeper-1.2.0/Makefile 0000664 0000000 0000000 00000001512 13244302333 0014621 0 ustar 00root root 0000000 0000000 # Packagers may want to override this!
# Installation layout; every path below is derived from $(prefix).
prefix ?= /usr/local
PHP_SCRIPT_DIR ?= $(prefix)/share/php
BIN_DIR := $(prefix)/bin
MAN_DIR := $(prefix)/share/man
TWEEPER_DIR := $(PHP_SCRIPT_DIR)/tweeper

# None of these targets produces a file named after itself.
.PHONY: all clean docs installdocs install

# Nothing to build: the program is made of PHP scripts and XSL stylesheets.
all:

# Remove the generated man page.
clean:
	rm -f tweeper.1

# Generate the man page from its asciidoc source.
docs:
	a2x -f manpage tweeper.1.asciidoc

# Install the man page.
installdocs: docs
	install -d $(DESTDIR)$(MAN_DIR)/man1
	install -m644 tweeper.1 $(DESTDIR)$(MAN_DIR)/man1

# Install the program files and link the wrapper script into BIN_DIR.
install: installdocs
	install -d $(DESTDIR)$(TWEEPER_DIR)
	install -m644 *.php $(DESTDIR)$(TWEEPER_DIR)
	install -m755 tweeper $(DESTDIR)$(TWEEPER_DIR)
	install -d $(DESTDIR)$(TWEEPER_DIR)/src
	install -m644 src/* $(DESTDIR)$(TWEEPER_DIR)/src
	install -d $(DESTDIR)$(BIN_DIR)
	ln -rsf $(DESTDIR)$(TWEEPER_DIR)/tweeper $(DESTDIR)$(BIN_DIR)/tweeper
	@# printf is used instead of "echo -e", which is not portable across shells.
	@printf '\n\nINSTALLATION COMPLETE\n'
	@printf "Make sure '%s' is in PHP include_path!\n\n" "$(DESTDIR)$(PHP_SCRIPT_DIR)"
tweeper-1.2.0/NEWS 0000664 0000000 0000000 00000007104 13244302333 0013663 0 ustar 00root root 0000000 0000000 News for v1.2.0:
================
* Add support for scraping Instagram location pages, like for example
https://www.instagram.com/explore/locations/833277432/
* Make scraping Instagram.com more robust
* Improve and fix scraping Facebook.com pages once again
* Add support for Twitter.com permalink URLs
* Make the generated Twitter.com feed match the original content more
closely: spaces and line wraps are now preserved in feed readers which can
render the HTML code embedded in the element, this way
ASCII art tweets can be fully appreciated when read via tweeper.
Check out https://twitter.com/sarahjeong/status/955651919279722496
News for v1.1.0:
================
* Make scraping Facebook.com pages more robust
* Fix getting the channel image for Facebook.com pages
* Add some development tools
* Fix a problem with some feed readers when showing images from Twitter.com
by ignoring the "style" attribute in the scraped HTML
* Filter out promoted tweets when scraping Twitter.com
* Remove support for Howtoons.com, the old blog is not available anymore
News for v1.0.0:
================
* Support "application/octet-stream" as an enclosure content type
* Support "application/pdf" as an enclosure content type
* Fix information leakage by validating the URL scheme
* Code restructuring to make it easier to use tweeper as a library in other
projects
* Allow installing tweeper via composer, the packagist page is at:
https://packagist.org/packages/ao2/tweeper
* Misc robustness fixes
News for v0.6:
==============
* Fix support for Facebook.com public pages
* Fix support for Dilbert.com
* Major code cleanup (coding style, functions naming)
* Fix indentation when generating the element
* Support generating enclosure for "image/png" links
* Major improvements for Twitter.com:
- embed images directly in the item description, linking to the original
versions uploaded by the user;
- use direct links instead of ones pointing to the t.co redirector;
- show explicitly if the attached media is a video;
- add enclosure element support for attached images.
* Minor improvements for Instagram.com:
- fix the channel link;
- make images adapt to the feed reader view, this avoids horizontal
scrolling if the image is too big.
* Support generating enclosure for images on Dilbert.com
* Support generating enclosure for images on Pump.io sites
* Misc fixes to code and documentation
News for v0.5:
==============
* Use the Symfony Serializer component instead of the PEAR XML_Serializer
* Make the Twitter stylesheet stricter to avoid empty timeline entries
News for v0.4:
==============
* Make the generated RSS validate with feedvalidator.org
* Fix support for Dilbert.com
* Add support for Instagram.com
* Add support for public pages on Facebook.com
* Make tweeper work with the PHP built-in web server
* Misc fixes to code and documentation
News for v0.3:
==============
* Support generating enclosure for "audio/ogg" links
* Always specify xml:base to improve local URLs expansions in some cases
* Support both the classic and the new Twitter profile pages
* Fix getting the profile picture of Twitter users
* Add support for Howtoons.com
News for v0.2:
==============
* Small fixes to the man page which must be in the coming Debian package
News for v0.1:
==============
* Initial version supporting Twitter.com
* Add support for pump.io sites
* Add support for dilbert.com
* Show links to supported media files in the RSS element
tweeper-1.2.0/README 0000664 0000000 0000000 00000004202 13244302333 0014040 0 ustar 00root root 0000000 0000000 Tweeper is a web scraper which can be used to conveniently follow the public
activity of social network users without the need to log in or even be
subscribed to the social network; tweeper converts the public information to
RSS so that it can be accessed and collected by a feed reader.
Since Jun 11th 2013, when Twitter.com retired their API v1.0, it has not been
possible anymore to access a user timeline via RSS, and it has also become
mandatory to authenticate via OAuth to access this _public_ information in the
JSON format:
https://dev.twitter.com/discussions/16289
https://dev.twitter.com/discussions/11564
Some services came up to overcome this "problem":
http://twss.55uk.net/
http://twitter-rss.com/ (now redirecting to google.com)
http://rssitfor.me
However these solutions are still shady and give the user no control over
who collects the information about the visited user timelines.
This is why Tweeper[1] was born, as an Open Source way to keep following your
friends with a certain degree of anonymity, without having to tell Twitter.com
who your friends are.
[1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173
Tweeper can easily scrape sites other than Twitter, it is just a matter of
writing an xsl stylesheet for the transformation; an example for pump.io
activity stream is provided in rss_converter_pump.io.xsl
The currently supported sites are:
* Twitter.com
* Pump.io based websites, like Identi.ca
* Dilbert.com
* Instagram.com
* Facebook.com (public pages)
Tweeper can be used via web or as a command line program, for example as
a filter in your feed reader, by passing the URL of the user's public timeline
as the first argument.
Example of use on the command line:
$ php tweeper.php https://twitter.com/NSACareers
Example of use as a Liferea[2] filter:
$ liferea-add-feed "|php .../path_to_tweeper/tweeper.php https://twitter.com/NSAcareers"
Example of use with identi.ca:
$ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://identi.ca/evan"
[2] http://lzone.de/liferea/
Tweeper is licensed under the GPLv3.
Tweeper was written by Antonio Ospite https://ao2.it
tweeper-1.2.0/TODO 0000664 0000000 0000000 00000001331 13244302333 0013650 0 ustar 00root root 0000000 0000000 - re-evaluate the use of trigger_error() or use a custom error handler,
because right now the code exits as soon as trigger_error() gets called and
any following code is ignored.
- write better XSL stylesheets? I am not an XSL expert
- evaluate the use of the RSS element
- show cards directly in RSS items for twitter.com
- show direct links for videos in the Instagram feed
- check the encoding of the tweets when UTF is used,
maybe solvable with mb_convert_encoding()?
See http://php.net/manual/en/domdocument.loadhtml.php
- The dependencies on the Symfony components in composer.json could be more
relaxed like ">=2.7.0", but for now sticking to "2.7.*" is good enough.
- Add support for instagram tags
tweeper-1.2.0/autoload.php 0000664 0000000 0000000 00000005466 13244302333 0015516 0 ustar 00root root 0000000 0000000
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
$package_name = 'ao2/tweeper';

if (file_exists(__DIR__ . '/vendor/autoload.php')) {
  /*
   * If "composer install" has been executed, use the composer autoloader.
   *
   * Using __DIR__ is OK as long as this file is on the same level of the
   * project "vendor/" directory (usually the project root directory).
   */
  require __DIR__ . '/vendor/autoload.php';
}
elseif (preg_match('/' . preg_quote('/vendor/' . $package_name, '/') . '$/', __DIR__)) {
  /*
   * If running from a "vendor/" directory of another project use the
   * autoloader of the parent project.
   *
   * This covers the case of running from a symlink in ./vendor/bin/ because
   * __DIR__ contains the *real path* of this file.
   *
   * Note that using __DIR__ here and going back two levels is OK under the
   * assumptions that this file is in the project root directory, and that the
   * package name has the structure VENDOR/PROJECT_NAME.
   */
  require __DIR__ . '/../../autoload.php';
}
else {
  /*
   * Otherwise, run without composer:
   *
   * 1. register our own autoloader function for the Tweeper class
   *
   * The implementation follows the one suggested in:
   * http://www.php-fig.org/psr/psr-4/
   */
  spl_autoload_register(function ($fully_qualified_class_name) {
    /* This matches the data defined for the PSR-4 autoloader in composer.json */
    $namespace_prefix = 'Tweeper\\';

    /*
     * Anchor the lookup to the directory of this file, so that the result
     * does not depend on include_path or on the current working directory.
     */
    $base_directory = __DIR__ . '/src/';

    /* Classes from other namespaces are left to the other autoloaders. */
    $len = strlen($namespace_prefix);
    if (strncmp($namespace_prefix, $fully_qualified_class_name, $len) !== 0) {
      return;
    }

    $class_relative = substr($fully_qualified_class_name, $len);
    $file_path = $base_directory . str_replace('\\', '/', $class_relative) . '.php';

    /*
     * Only load files which exist: an autoloader must not raise a fatal
     * error for unknown classes, otherwise checks like class_exists() and
     * any autoloader registered afterwards would never get a chance to run.
     */
    if (file_exists($file_path)) {
      require_once $file_path;
    }
  });

  /*
   * 2. load the system-wide autoloader from php-symfony-serializer
   *
   * This allows to run tweeper without composer, as long as the Symfony
   * dependencies are available system-wide.
   *
   * For example, the Debian package takes care of that.
   */
  require_once 'Symfony/Component/Serializer/autoload.php';
}
tweeper-1.2.0/composer.json 0000664 0000000 0000000 00000001514 13244302333 0015705 0 ustar 00root root 0000000 0000000 {
"name": "ao2/tweeper",
"type": "library",
"description": "Tweeper is a web scraper to convert popular social media sites to RSS (e.g. Twitter.com, Instagram.com).",
"keywords": ["Twitter", "Instagram", "Facebook", "RSS", "scraper"],
"homepage": "https://git.ao2.it/tweeper.git",
"license": "GPL-3.0+",
"authors": [
{
"name": "Antonio Ospite",
"email": "ao2@ao2.it",
"homepage": "https://ao2.it",
"role": "Developer"
}
],
"require": {
"php": ">=5.3.0",
"ext-curl": "*",
"ext-dom": "*",
"ext-json": "*",
"ext-xsl": "*",
"symfony/serializer": ">=2.7.0",
"symfony/property-access": ">=2.7.0"
},
"autoload": {
"psr-4": { "Tweeper\\": "src/" }
},
"bin": ["tweeper"]
}
tweeper-1.2.0/src/ 0000775 0000000 0000000 00000000000 13244302333 0013751 5 ustar 00root root 0000000 0000000 tweeper-1.2.0/src/Tweeper.php 0000664 0000000 0000000 00000024412 13244302333 0016100 0 ustar 00root root 0000000 0000000
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
use DOMDocument;
use XSLTProcessor;
use Symfony\Component\Serializer\Serializer;
use Symfony\Component\Serializer\Encoder\XmlEncoder;
use Symfony\Component\Serializer\Normalizer\ObjectNormalizer;
date_default_timezone_set('UTC');
/**
* Scrape supported websites and perform conversion to RSS.
*/
class Tweeper {
/**
 * The User-Agent string sent with every HTTP request.
 *
 * @var string
 */
private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";

/**
 * Whether the stylesheets are asked to generate enclosure elements.
 *
 * The value is forwarded to the XSLT processor as the "generate-enclosure"
 * parameter.
 *
 * The property is declared explicitly because relying on dynamically created
 * properties is deprecated in recent PHP versions; it is kept public because
 * that was the visibility of the former dynamic property.
 *
 * @var bool
 */
public $generate_enclosure = FALSE;

/**
 * Constructor sets up {@link $generate_enclosure}.
 *
 * @param bool $generate_enclosure
 *   TRUE to request the generation of enclosure elements.
 */
public function __construct($generate_enclosure = FALSE) {
  $this->generate_enclosure = $generate_enclosure;
}
/**
 * Convert numeric Epoch to the date format expected in a RSS document.
 *
 * @param mixed $timestamp
 *   Seconds since the Unix Epoch, as a number or as a numeric string.
 *
 * @return string
 *   The date formatted according to DATE_RSS, in UTC. Values which are not
 *   numeric, or not finite, are formatted as the Epoch itself (timestamp 0).
 */
public static function epochToRssDate($timestamp) {
  // is_finite() rejects both NAN and +/-INF, neither of which can be
  // represented as an integer timestamp.
  if (!is_numeric($timestamp) || !is_finite((float) $timestamp)) {
    return gmdate(DATE_RSS, 0);
  }

  // gmdate() expects an integer: convert explicitly, going through float so
  // that numeric strings in exponent notation are handled consistently and
  // fractional seconds are dropped without an implicit-conversion notice.
  return gmdate(DATE_RSS, (int) (float) $timestamp);
}
/**
 * Convert generic date string to the date format expected in a RSS document.
 *
 * @param string $date
 *   A date in any format understood by strtotime().
 *
 * @return string
 *   The RSS date; strings which cannot be parsed map to timestamp 0.
 */
public static function strToRssDate($date) {
  $parsed = strtotime($date);
  return Tweeper::epochToRssDate(FALSE === $parsed ? 0 : $parsed);
}
/**
 * Convert string to UpperCamelCase.
 *
 * @param string $str
 *   The string to convert.
 * @param string $delim
 *   The delimiter separating the words in $str.
 *
 * @return string
 *   $str with the first character of each word upper-cased and the
 *   delimiters removed.
 */
public static function toUpperCamelCase($str, $delim = ' ') {
  return str_replace($delim, '', ucwords($str, $delim));
}
/**
 * Get the contents from a URL.
 *
 * @param string $url
 *   The URL to fetch.
 *
 * @return string|false
 *   The body of the response, or FALSE on failure (in which case an error is
 *   also raised via trigger_error()).
 */
private static function getUrlContents($url) {
  $ch = curl_init($url);
  if (FALSE === $ch) {
    trigger_error("Cannot initialize a cURL session for URL: $url");
    return FALSE;
  }

  curl_setopt_array($ch, array(
    CURLOPT_HEADER => FALSE,
    // Follow http redirects to get the real URL.
    CURLOPT_FOLLOWLOCATION => TRUE,
    // Never let a redirect leave the web, e.g. towards a local file.
    CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_RETURNTRANSFER => TRUE,
    // Verify the TLS certificates: skipping the verification would let
    // anyone on the network path read or alter the scraped content.
    CURLOPT_SSL_VERIFYHOST => 2,
    CURLOPT_SSL_VERIFYPEER => TRUE,
    CURLOPT_HTTPHEADER => array('Accept-language: en'),
    CURLOPT_USERAGENT => Tweeper::$userAgent,
  ));

  $contents = curl_exec($ch);
  if (FALSE === $contents) {
    trigger_error(curl_error($ch));
  }
  curl_close($ch);

  return $contents;
}
/**
 * Get the headers from a URL.
 *
 * Only the headers are requested (no body is downloaded).
 *
 * @param string $url
 *   The URL to inspect.
 *
 * @return array|false
 *   The information returned by curl_getinfo() for the request, or FALSE if
 *   it cannot be retrieved.
 */
private static function getUrlInfo($url) {
  $ch = curl_init($url);
  if (FALSE === $ch) {
    trigger_error("Cannot initialize a cURL session for URL: $url");
    return FALSE;
  }

  curl_setopt_array($ch, array(
    CURLOPT_HEADER => TRUE,
    CURLOPT_NOBODY => TRUE,
    // Follow http redirects to get the real URL.
    CURLOPT_FOLLOWLOCATION => TRUE,
    // Never let a redirect leave the web, e.g. towards a local file.
    CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_RETURNTRANSFER => TRUE,
    // Verify the TLS certificates, see the cURL documentation about the
    // risks of disabling peer and host verification.
    CURLOPT_SSL_VERIFYHOST => 2,
    CURLOPT_SSL_VERIFYPEER => TRUE,
    CURLOPT_USERAGENT => Tweeper::$userAgent,
  ));

  // The transfer itself is what can fail, so report its errors; the info
  // array is still returned to the caller, which decides what to do with
  // incomplete data.
  if (FALSE === curl_exec($ch)) {
    trigger_error(curl_error($ch));
  }

  $url_info = curl_getinfo($ch);
  if (FALSE === $url_info) {
    trigger_error(curl_error($ch));
  }
  curl_close($ch);

  return $url_info;
}
/**
 * Generate an RSS enclosure element.
 *
 * @param string $url
 *   The URL of the media file to reference.
 *
 * @return DOMElement|string
 *   An "enclosure" element with the "url", "length" and "type" attributes
 *   set, or an empty string when the content type of the URL is not
 *   supported or cannot be determined.
 */
public static function generateEnclosure($url) {
  $supported_content_types = array(
    "application/octet-stream",
    "application/ogg",
    "application/pdf",
    "audio/aac",
    "audio/mp4",
    "audio/mpeg",
    "audio/ogg",
    "audio/vorbis",
    "audio/wav",
    "audio/webm",
    "audio/x-midi",
    "image/gif",
    "image/jpeg",
    "image/png",
    "video/avi",
    "video/mp4",
    "video/mpeg",
    "video/ogg",
  );

  $url_info = Tweeper::getUrlInfo($url);
  if (!is_array($url_info)) {
    error_log("Cannot get the content type for URL: " . $url);
    return '';
  }

  $raw_content_type = isset($url_info['content_type']) ? (string) $url_info['content_type'] : '';

  // Servers may append parameters to the media type (for example
  // "image/png; charset=binary") and media types are case-insensitive, so
  // normalize the value before looking it up.
  $content_type_parts = explode(';', $raw_content_type, 2);
  $content_type = strtolower(trim($content_type_parts[0]));

  if (!in_array($content_type, $supported_content_types, TRUE)) {
    error_log("Unsupported enclosure content type \"" . $raw_content_type . "\" for URL: " . $url_info['url']);
    return '';
  }

  // The RSS specification says that the enclosure element URL must be http.
  // See http://sourceforge.net/p/feedvalidator/bugs/72/
  $http_url = preg_replace("/^https:/i", "http:", $url_info['url']);

  // cURL reports a negative length when the server does not provide one;
  // use 0 in that case rather than emitting a negative size.
  $length = isset($url_info['download_content_length']) ? (int) $url_info['download_content_length'] : 0;
  if ($length < 0) {
    $length = 0;
  }

  $dom = new DOMDocument();
  $enc = $dom->createElement('enclosure');
  $enc->setAttribute('url', $http_url);
  $enc->setAttribute('length', (string) $length);
  $enc->setAttribute('type', $content_type);

  return $enc;
}
/**
 * Mimic the message from libxml.c::php_libxml_ctx_error_level()
 *
 * @param object $error
 *   An error as returned by libxml_get_errors(); its level, code, message,
 *   file and line properties are used to build the logged message.
 */
private static function logXmlError($error) {
  // Human readable names of the severity levels which get a prefix.
  $level_labels = array(
    LIBXML_ERR_WARNING => "Warning",
    LIBXML_ERR_ERROR => "Error",
    LIBXML_ERR_FATAL => "Fatal Error",
  );

  $prefix = "";
  if (isset($level_labels[$error->level])) {
    $prefix = $level_labels[$error->level] . " $error->code: ";
  }

  $location = $error->file ? " in $error->file" : " in Entity,";

  error_log($prefix . trim($error->message) . $location . " line $error->line");
}
/**
 * Convert json to XML.
 *
 * @param string $json
 *   The JSON document to convert.
 * @param string $root_node_name
 *   The name to use for the root node of the XML document.
 *
 * @return string|null
 *   The XML document, or NULL when the JSON data decodes to an empty value
 *   or cannot be serialized.
 */
private static function jsonToXml($json, $root_node_name) {
  // Decode to associative arrays: apparently the ObjectNormalizer used below
  // is not able to handle the stdClass objects which json_decode() creates
  // by default.
  $decoded = json_decode($json, TRUE);
  if (!$decoded) {
    return NULL;
  }

  $serializer = new Serializer(array(new ObjectNormalizer()), array(new XmlEncoder()));

  $xml = $serializer->serialize($decoded, 'xml', array(
    'xml_encoding' => "UTF-8",
    'xml_format_output' => TRUE,
    'xml_root_node_name' => $root_node_name,
  ));
  if ($xml) {
    return $xml;
  }

  trigger_error("Cannot serialize data", E_USER_ERROR);
  return NULL;
}
/**
 * Convert the Instagram content to XML.
 *
 * The data is taken from the JSON object assigned to "window._sharedData"
 * in the page.
 *
 * @param string $html
 *   The HTML code of the Instagram page.
 *
 * @return string|null
 *   The XML document with an "instagram" root node, or NULL on failure.
 */
private function getXmlInstagramCom($html) {
  // Extract the json data from the html code; the dot in the property name
  // is escaped so that it only matches a literal dot.
  $json_match_expr = '/window\._sharedData = (.*);/';
  $ret = preg_match($json_match_expr, $html, $matches);
  if ($ret !== 1) {
    trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
    return NULL;
  }

  $data = json_decode($matches[1], TRUE);
  if (!is_array($data)) {
    trigger_error("Cannot decode the Instagram shared data as JSON", E_USER_ERROR);
    return NULL;
  }

  // The "qe" object contains elements which will result in invalid XML
  // element names, so remove it.
  unset($data["qe"]);

  return Tweeper::jsonToXml(json_encode($data), 'instagram');
}
/**
 * Make the Facebook HTML processable.
 *
 * This method is looked up by tweep() through a name built from the host
 * ('preprocessHtml' followed by the host in UpperCamelCase).
 *
 * @param string $html
 *   The HTML code retrieved from the site.
 *
 * @return string
 *   The result of the str_replace() call on $html.
 */
private function preprocessHtmlFacebookCom($html) {
// NOTE(review): both the search and the replacement strings are empty here,
// so this call looks like a no-op; the original search string may have been
// lost -- TODO confirm against the upstream sources.
$html = str_replace('', '', $html);
return $html;
}
/**
 * Convert the HTML retrieved from the site to XML.
 *
 * When the class provides a host-specific "getXml<Host>" method, the XML
 * data comes from that method; otherwise the HTML is parsed directly.
 *
 * @param string $html
 *   The content retrieved from the site.
 * @param string $host
 *   The host name, used to look up the host-specific method.
 *
 * @return DOMDocument|null
 *   The loaded document, or NULL when there is nothing to parse or the
 *   parsing fails.
 */
private function htmlToXml($html, $host) {
  $xmlDoc = new DOMDocument();

  // Handle warnings and errors when loading invalid HTML.
  $xml_errors_value = libxml_use_internal_errors(TRUE);

  // If there is a host-specific method to get the XML data, use it!
  $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.');
  if (method_exists($this, $get_xml_host_method)) {
    $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
    // The host-specific method returns NULL on failure, and the loader must
    // not be called with empty input.
    $loaded = is_string($xml_data) && $xml_data !== '' && $xmlDoc->loadXML($xml_data);
  }
  else {
    $loaded = is_string($html) && $html !== '' && $xmlDoc->loadHTML($html);
  }

  foreach (libxml_get_errors() as $xml_error) {
    Tweeper::logXmlError($xml_error);
  }
  libxml_clear_errors();

  // Restore the previous error handling mode before leaving, whatever the
  // outcome of the parsing.
  libxml_use_internal_errors($xml_errors_value);

  if (!$loaded) {
    error_log("Cannot parse the content retrieved for $host");
    return NULL;
  }

  return $xmlDoc;
}
/**
 * Load a stylesheet if the web site is supported.
 *
 * The stylesheet for a host is the file "rss_converter_<host>.xsl" in the
 * directory of this class.
 *
 * @param string $host
 *   The host name of the site to convert.
 *
 * @return XSLTProcessor|null
 *   A processor with the stylesheet imported, or NULL when the site is not
 *   supported or the stylesheet cannot be loaded.
 */
private function loadStylesheet($host) {
  // The host ends up in a file name, and it may come from user input: do not
  // let it point outside of the stylesheets directory.
  if (!is_string($host) || $host === '' || FALSE !== strpbrk($host, "/\\\0")) {
    trigger_error("Invalid host: " . var_export($host, TRUE), E_USER_ERROR);
    return NULL;
  }

  $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
  if (FALSE === file_exists($stylesheet)) {
    trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR);
    return NULL;
  }

  // The stylesheet is a local file, load it directly.
  $xslDoc = new DOMDocument();
  if (FALSE === $xslDoc->load($stylesheet)) {
    trigger_error("Cannot load the stylesheet $stylesheet", E_USER_ERROR);
    return NULL;
  }

  $xsltProcessor = new XSLTProcessor();
  $xsltProcessor->registerPHPFunctions();
  $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure);
  if (FALSE === $xsltProcessor->importStylesheet($xslDoc)) {
    trigger_error("Cannot import the stylesheet $stylesheet", E_USER_ERROR);
    return NULL;
  }

  return $xsltProcessor;
}
/**
 * Convert the site content to RSS.
 *
 * @param string $src_url
 *   The URL of the page to convert.
 * @param string|null $host
 *   The host name used to select the stylesheet and the host-specific
 *   methods; when NULL it is derived from $src_url.
 * @param bool $validate_scheme
 *   When TRUE only "http" and "https" URLs are accepted.
 *
 * @return string|null
 *   The RSS document, or NULL on failure.
 */
public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) {
  $url = parse_url($src_url);
  if (FALSE === $url) {
    trigger_error("Invalid URL: $src_url", E_USER_ERROR);
    return NULL;
  }

  if (TRUE === $validate_scheme) {
    // The scheme may be missing altogether (e.g. "twitter.com/user"), and
    // URL schemes are case-insensitive.
    $scheme = isset($url["scheme"]) ? strtolower($url["scheme"]) : '';
    if (!in_array($scheme, array("http", "https"), TRUE)) {
      trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
      return NULL;
    }
  }

  // if the host is not given derive it from the URL
  if (NULL === $host) {
    if (empty($url["host"])) {
      trigger_error("Invalid host in URL: $src_url", E_USER_ERROR);
      return NULL;
    }
    // Host names are case-insensitive, while the stylesheet lookup may not
    // be; also strip the leading www. to be more forgiving on input URLs.
    $host = preg_replace('/^www\./', '', strtolower($url["host"]));
  }

  $xsltProcessor = $this->loadStylesheet($host);
  if (NULL === $xsltProcessor) {
    return NULL;
  }

  $html = Tweeper::getUrlContents($src_url);
  if (FALSE === $html) {
    return NULL;
  }

  // Give the host-specific code a chance to fix up the HTML before parsing.
  $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.');
  if (method_exists($this, $preprocess_html_host_method)) {
    $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
  }

  $xmlDoc = $this->htmlToXml($html, $host);
  if (NULL === $xmlDoc) {
    return NULL;
  }

  $output = $xsltProcessor->transformToXML($xmlDoc);
  if (FALSE === $output) {
    trigger_error('XSL transformation failed.', E_USER_ERROR);
    return NULL;
  }

  return $output;
}
}
tweeper-1.2.0/src/rss_converter_dilbert.com.xsl 0000664 0000000 0000000 00000011347 13244302333 0021667 0 ustar 00root root 0000000 0000000
...<![CDATA[]]>Tweeper
tweeper-1.2.0/src/rss_converter_facebook.com.xsl 0000664 0000000 0000000 00000013472 13244302333 0022014 0 ustar 00root root 0000000 0000000
https://facebook.com...<![CDATA[]]>Tweeper<![CDATA[]]>
tweeper-1.2.0/src/rss_converter_identi.ca.xsl 0000777 0000000 0000000 00000000000 13244302333 0026405 2rss_converter_pump.io.xsl ustar 00root root 0000000 0000000 tweeper-1.2.0/src/rss_converter_instagram.com.xsl 0000664 0000000 0000000 00000016014 13244302333 0022223 0 ustar 00root root 0000000 0000000
https://instagram.com...<![CDATA[
(Video)
]]>Tweeper<![CDATA[]]>
tweeper-1.2.0/src/rss_converter_pump.io.xsl 0000664 0000000 0000000 00000010767 13244302333 0021061 0 ustar 00root root 0000000 0000000
<![CDATA[]]>Tweeper
tweeper-1.2.0/src/rss_converter_twitter.com.xsl 0000664 0000000 0000000 00000022413 13244302333 0021740 0 ustar 00root root 0000000 0000000
https://twitter.com(Video) <![CDATA[(Video)white-space: pre-wrap;]]>Tweeper
tweeper-1.2.0/tests/ 0000775 0000000 0000000 00000000000 13244302333 0014324 5 ustar 00root root 0000000 0000000 tweeper-1.2.0/tests/fetch_facebook_page.sh 0000775 0000000 0000000 00000001046 13244302333 0020602 0 ustar 00root root 0000000 0000000 #!/bin/sh
#
# Facebook requires a CAPTCHA most of the time, so keep fetching the URL as
# long as needed, until the page is shown with no CAPTCHA.
#
# Usage: fetch_facebook_page.sh <url>
#
# The fetched page is saved as facebook.html in the current directory.

set -e

if [ -z "$1" ];
then
  echo "usage: $0 <url>" >&2
  exit 1
fi

USER_AGENT="Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";

while true;
do
  # Force language to en-us to make sure that the string matching works
  OUTPUT=$(wget -nv --user-agent="$USER_AGENT" --header='Accept-Language: en-us' -O - -- "$1")

  # Quote the page content, so that the shell does not split or expand it,
  # and use printf because echo may interpret backslashes in the content.
  if ! printf '%s\n' "$OUTPUT" | grep -q "Security Check Required";
  then
    printf '%s\n' "$OUTPUT" > facebook.html
    break
  fi

  sleep 5
done
tweeper-1.2.0/tests/instument_to_catch_promoted_tweets.diff 0000664 0000000 0000000 00000001264 13244302333 0024357 0 ustar 00root root 0000000 0000000 diff --git a/src/Tweeper.php b/src/Tweeper.php
index 8ac2fe3..c45aab5 100644
--- a/src/Tweeper.php
+++ b/src/Tweeper.php
@@ -355,6 +355,15 @@ class Tweeper {
$html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
}
+ // XXX REMOVE: instrumentation to catch promoted tweets
+ if ($host == "twitter.com") {
+ $twitter_promoted_match_expr = '/promoted/i';
+ $ret = preg_match($twitter_promoted_match_expr, $html, $matches);
+ if ($ret) {
+ file_put_contents("/home/ao2/TWITTER_PROMOTED_DUMP.html", $html);
+ }
+ }
+
$xmlDoc = $this->htmlToXml($html, $host);
if (NULL === $xmlDoc) {
return NULL;
tweeper-1.2.0/tests/test_information_leakage.sh 0000775 0000000 0000000 00000002020 13244302333 0021712 0 ustar 00root root 0000000 0000000 #!/bin/sh
# Stop at the first command which fails.
set -e
# The tweeper executable under test; it is run directly by file_exists() and
# its directory is used as the document root of the test web server.
TWEEPER="/usr/share/php/tweeper/tweeper"
# Alternative setting pointing to ./tweeper instead, disabled by default.
#TWEEPER="./tweeper"
# Print a report about one check.
#
# $1: the URL which has been requested
# $2: a description of the file the URL points to
# $3: the output of the request; any non-empty output is reported as
#     "exists", an empty one as "does not exist"
check_result() {
  URL="$1"
  FILE="$2"
  RESULT="$3"

  echo "URL $URL"
  if [ -z "$RESULT" ];
  then
    echo "... $FILE"
    echo " does not exist"
  else
    echo "--> $FILE"
    echo " exists"
  fi
  echo
}
# Run tweeper from the command line on a file:// URL for the given file and
# report whether it produced any output.
#
# $1: the path of the file to probe
file_exists() {
  FILE="$1"
  URL="file://twitter.com/$FILE"

  # Quote the expansions so that paths with spaces or glob characters are
  # passed unchanged; a failing tweeper is an expected outcome here, so it
  # must not abort the script before the result is reported.
  OUTPUT=$("$TWEEPER" "$URL") || true

  check_result "$URL" "$FILE" "$OUTPUT"
}
# Request a file:// URL for the given file through tweeper.php on a web
# server and report whether the server returned any output.
#
# $1: the base URL of the server
# $2: the path of the file to probe
file_exists_on_server() {
  SERVER="$1"
  FILE="$2"
  URL="file://twitter.com/$FILE"

  # Let curl build the query string: the URL is passed URL-encoded, and the
  # "?" never reaches the shell unquoted, where it would be a glob character.
  OUTPUT=$(curl -s -G --data-urlencode "src_url=$URL" "$SERVER/tweeper.php" 2> /dev/null) || true

  check_result "$URL" "$FILE on $SERVER" "$OUTPUT"
}
# Check the command line program first: probe a file which exists on most
# systems and one which most likely does not.
file_exists /etc/passwd || true
file_exists /etc/file_with_an_unlikely_name || true

echo "Starting a test server"
echo
php -S localhost:8000 -t "$(dirname "$TWEEPER")" > /dev/null 2>&1 &
SERVER_PID=$!

# Make sure the test server does not outlive the script if it gets
# interrupted before the explicit shutdown below.
trap 'kill "$SERVER_PID" 2> /dev/null || true' EXIT

# Give the server some time to start accepting connections.
sleep 1

# Repeat the same checks through the web interface.
file_exists_on_server http://localhost:8000 /etc/passwd || true
file_exists_on_server http://localhost:8000 /etc/file_with_an_unlikely_name || true

echo "Shutting down the test server"
kill "$SERVER_PID"
trap - EXIT
tweeper-1.2.0/tests/tweeper_file 0000775 0000000 0000000 00000000647 13244302333 0016733 0 ustar 00root root 0000000 0000000 #!/usr/bin/env php
\n";
// Both the file to convert and the host name are mandatory.
if ($argc < 3) {
  fwrite(STDERR, $usage);
  exit(1);
}

// realpath() returns FALSE for files which do not exist: fail early with a
// clear message instead of handing a bare "file://" URL to tweeper.
$file_path = realpath($argv[1]);
if (FALSE === $file_path) {
  fwrite(STDERR, "Cannot find file: {$argv[1]}\n");
  exit(1);
}

$file_url = 'file://' . $file_path;
$host = $argv[2];

// The scheme validation is disabled because a file:// URL is used on
// purpose here.
$tweeper = new Tweeper();
$output = $tweeper->tweep($file_url, $host, false);
if (is_null($output)) {
  exit(1);
}

echo $output;
tweeper-1.2.0/tweeper 0000775 0000000 0000000 00000001132 13244302333 0014560 0 ustar 00root root 0000000 0000000 #!/usr/bin/env php
element
*-h, --help*::
show the help message
EXAMPLE OF USE
--------------
Getting the RSS feed of some Twitter user:
tweeper https://twitter.com/NSACareers
Using tweeper as a filter for the Liferea feed reader:
liferea-add-feed "|tweeper https://twitter.com/NSAcareers"
To use tweeper via web there are two options (the examples assume the
installation directory to be `/usr/share/php/tweeper/`):
1. Using the PHP built-in web server:
php -S localhost:8000 -t /usr/share/php/tweeper/
+
and then visit 'http://localhost:8000/tweeper.php' in the web browser.
2. Using a generic web server with the document root in '/var/www':
sudo ln -s /usr/share/php/tweeper/tweeper.php /var/www
xdg-open http://localhost/tweeper.php?src_url=http://twitter.com/NSAcareers
+
It is enough to create the symlink only the very first time tweeper is used
this way.
NOTES
-----
In order to use tweeper with a symlink with the apache 'userdir' module, the
'SymLinksIfOwnerMatch' option must be replaced by 'FollowSymLinks' in
/etc/apache2/mods-enabled/userdir.conf
EXIT STATUS
-----------
*0*::
Success
*!0*::
Failure
AUTHORS
-------
Antonio Ospite
RESOURCES
---------
Main web site:
COPYING
-------
Copyright \(C) 2013-2016 Antonio Ospite
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
tweeper-1.2.0/tweeper.php 0000664 0000000 0000000 00000005252 13244302333 0015352 0 ustar 00root root 0000000 0000000
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
require_once 'autoload.php';
use Tweeper\Tweeper;
date_default_timezone_set('UTC');
/**
 * Tell whether the script is running under the command line SAPI.
 *
 * @return bool
 *   TRUE when the PHP SAPI name is "cli", FALSE otherwise.
 */
function is_cli() {
  $sapi_name = php_sapi_name();
  return "cli" === $sapi_name;
}
/**
 * Build the usage message of the script.
 *
 * @param array|null $argv
 *   The command line arguments; only the script name in $argv[0] is used,
 *   and only when running from the command line.
 *
 * @return string
 *   The usage message: a command line synopsis when running from the command
 *   line, an HTML-escaped query string synopsis otherwise.
 */
function usage($argv) {
  // NOTE(review): the synopsis strings may be missing a placeholder for the
  // source URL -- TODO confirm against the upstream sources.
  if (!is_cli()) {
    return "usage: " . htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=&generate_enclosure=<0|1>");
  }

  return "usage: " . "{$argv[0]} [-e|-h|--help] \n";
}
/**
 * Parse command line options.
 *
 * @param array $argv
 *   The command line arguments, script name included.
 * @param int $argc
 *   The number of command line arguments.
 *
 * @return array
 *   The options: 'generate_enclosure' is always set, 'src_url' is set only
 *   when a URL has been passed on the command line.
 */
function parse_options_cli($argv, $argc) {
  $options = array(
    'generate_enclosure' => FALSE,
  );

  if ($argc < 2) {
    return $options;
  }

  $cli_options = getopt("eh", array("help"));
  if (!is_array($cli_options)) {
    $cli_options = array();
  }

  foreach ($cli_options as $opt => $val) {
    switch ($opt) {
      case 'e':
        $options['generate_enclosure'] = TRUE;
        break;

      case 'h':
      case 'help':
        echo usage($argv);
        exit(0);

      default:
        fwrite(STDERR, usage($argv));
        exit(1);
    }
  }

  // The source URL is the first argument which is not an option.
  //
  // Its position cannot be derived from the number of parsed options,
  // because getopt() groups repeated options under one key, and because the
  // URL may be missing altogether.
  $end_of_options = FALSE;
  foreach (array_slice($argv, 1) as $arg) {
    if (!$end_of_options) {
      // A lone "--" marks the end of the options.
      if ('--' === $arg) {
        $end_of_options = TRUE;
        continue;
      }
      if (strlen($arg) > 1 && '-' === $arg[0]) {
        continue;
      }
    }

    $options['src_url'] = $arg;
    break;
  }

  return $options;
}
/**
 * Parse options passed from a query string.
 *
 * @return array
 *   The options: 'generate_enclosure' is always set (TRUE only when the query
 *   string parameter compares equal to 1), 'src_url' is set only when it is
 *   present in the query string.
 */
function parse_options_query_string() {
  $options = array(
    'generate_enclosure' => FALSE,
  );

  foreach (array('src_url', 'generate_enclosure') as $name) {
    if (!isset($_GET[$name])) {
      continue;
    }
    $options[$name] = ('generate_enclosure' === $name) ? ($_GET[$name] == 1) : $_GET[$name];
  }

  return $options;
}
// The options come from the command line or from the query string, and the
// usage message goes to stderr or to the page, depending on how the script
// is being run.
$running_from_cli = is_cli();
$options = $running_from_cli ? parse_options_cli($argv, $argc) : parse_options_query_string();
$error_stream = fopen($running_from_cli ? 'php://stderr' : 'php://output', 'w');

// Without a source URL there is nothing to convert.
if (!isset($options['src_url'])) {
  fwrite($error_stream, usage($running_from_cli ? $argv : NULL));
  exit(1);
}

$tweeper = new Tweeper($options['generate_enclosure']);
$output = $tweeper->tweep($options['src_url']);

// A NULL output means that the conversion failed.
if (is_null($output)) {
  exit(1);
}

echo $output;