pax_global_header00006660000000000000000000000064132443023330014507gustar00rootroot0000000000000052 comment=c36da4f46b23dea2b762cdde9b0e9514251e29c5 tweeper-1.2.0/000077500000000000000000000000001324430233300131625ustar00rootroot00000000000000tweeper-1.2.0/HACKING000066400000000000000000000012061324430233300141500ustar00rootroot00000000000000The code follows the Drupal coding standards: https://www.drupal.org/coding-standards Style compliance can be checked using the Coder Sniffer extension to the PEAR PHP_CodeSniffer project, for instructions about how to install Coder Sniffer see https://www.drupal.org/node/1419988 TL;DR: install drupla/coder and enable the Drupal coding standard in PHP_CodeSniffer: $ composer global require drupal/coder $ export PATH="$HOME/.config/composer/vendor/bin:$PATH" $ phpcs --config-set installed_paths $HOME/.config/composer/vendor/drupal/coder/coder_sniffer/ And then use this command to check the style: $ phpcs --standard=Drupal . tweeper-1.2.0/INSTALL000066400000000000000000000011071324430233300142120ustar00rootroot00000000000000The recommended way to install tweeper globally is to install all its files under /usr/share/php/tweeper and then make a symlink to the wrapper script "tweeper" under /usr/bin Tweeper depends on php-symfony-serializer which is used to convert json to xml for some sites which provide the timeline data in json rather than in usable html. NOTE: Tweeper also depends indirectly on php-symfony-property-access because the code relies on the ObjectNormalizer class which requires the PropertyAccess component, see http://symfony.com/doc/current/components/serializer.html#installation tweeper-1.2.0/Makefile000066400000000000000000000015121324430233300146210ustar00rootroot00000000000000# Packagers may want to override this! prefix ?= /usr/local PHP_SCRIPT_DIR ?= $(prefix)/share/php BIN_DIR := $(prefix)/bin MAN_DIR := $(prefix)/share/man TWEEPER_DIR := $(PHP_SCRIPT_DIR)/tweeper all: clean: rm -f tweeper.1 docs: a2x -f manpage tweeper.1.asciidoc installdocs: docs install -d $(DESTDIR)$(MAN_DIR)/man1 install -m644 tweeper.1 $(DESTDIR)$(MAN_DIR)/man1 install: installdocs install -d $(DESTDIR)$(TWEEPER_DIR) install -m644 *.php $(DESTDIR)$(TWEEPER_DIR) install -m755 tweeper $(DESTDIR)$(TWEEPER_DIR) install -d $(DESTDIR)$(TWEEPER_DIR)/src install -m644 src/* $(DESTDIR)$(TWEEPER_DIR)/src install -d $(DESTDIR)$(BIN_DIR) ln -rsf $(DESTDIR)$(TWEEPER_DIR)/tweeper $(DESTDIR)$(BIN_DIR)/tweeper @echo -e "\n\nINSTALLATION COMPLETE" @echo -e "Make sure '$(DESTDIR)$(PHP_SCRIPT_DIR)' is in PHP include_path!\n" tweeper-1.2.0/NEWS000066400000000000000000000071041324430233300136630ustar00rootroot00000000000000News for v1.2.0: ================ * Add support for scraping Instagram location pages, like for example https://www.instagram.com/explore/locations/833277432/ * Make scraping Instagram.com more robust * Improve and fix scraping Facebook.com pages once again * Add support for Twitter.com permalink URLs * Make the generated Twitter.com feed mach more closely the original content, now spaces and line wrap are preserved in feed reader which can render the HTML code embedded in the element, this way ASCII art tweets can be fully appreciated when read via tweeper. Check out https://twitter.com/sarahjeong/status/955651919279722496 News for v1.1.0: ================ * Make scraping Facebook.com pages more robust * Fix getting the channel image for Facebook.com pages * Add some development tools * Fix a problem with some feed readers when showing images from Twitter.com by ignoring the "style" attribute in the scraped HTML * Filter out promoted tweets when scraping Twitter.com * Remove support for Howtoons.com, the old blog is not available anymore News for v1.0.0: ================ * Support "application/octet-stream" as an enclosure content type * Support "application/pdf" as an enclosure content type * Fix information leakage by validating the URL scheme * Code restructuring to make it easier to use tweeper as a library in other projects * Allow installing tweeper via composer, the packagist page is at: https://packagist.org/packages/ao2/tweeper * Misc robustness fixes News for v0.6: ============== * Fix support for Facebook.com public pages * Fix support for Dilbert.com * Major code cleanup (coding style, functions naming) * Fix indentation when generating the element * Support generating enclosure for "image/png" links * Major improvements for Twitter.com: - embed images directly in the item description, linking to the original versions uploaded by the user; - use direct links instead of ones pointing to the t.co redirector; - show explicitly if the attached media is a video; - add enclosure element support for attached images. * Minor improvements for Instagram.com: - fix the channel link; - make images adapt to the feed reader view, this avoids horizontal scrolling if the image is too big. * Support generating enclosure for images on Dilbert.com * Support generating enclosure for images on Pump.io sites * Misc fixes to code and documentation News for v0.5: ============== * Use the Symfony Serializer component instead of the PEAR XML_Serializer * Make the Twitter stylesheet stricter to avoid empty timeline entries News for v0.4: ============== * Make the generated RSS validate with feedvalidator.org * Fix support for Dilbert.com * Add support for Instragram.com * Add support for public pages on Facebook.com * Make tweeper work with the PHP built-in web server * Misc fixes to code and documentation News for v0.3: ============== * Support generating enclosure for "audio/ogg" links * Always specify xml:base to improve local URLs expansions in some cases * Support both the classic and the new Twitter profile pages * Fix getting the profile picture of Twitter users * Add support for Howtoons.com News for v0.2: ============== * Small fixes to the man page which must be in the coming Debian package News for v0.1: ============== * Initial version supporting Twitter.com * Add support for pump.io sites * Add support for dilbert.com * Show links to supported media files in the RSS element tweeper-1.2.0/README000066400000000000000000000042021324430233300140400ustar00rootroot00000000000000Tweeper is a web scraper which can be used to conveniently follow the public activity of social network users without the need to log in or even be subscribed to the social network; tweeper converts the public information to RSS so that it can be accessed and collected by a feed reader. Since Jun 11th 2013, when Twitter.com retired their API v1.0, it has not been possible anymore to access a user timeline via RSS, and it has also become mandatory to authenticate via OAuth to access this _public_ information in the JSON format: https://dev.twitter.com/discussions/16289 https://dev.twitter.com/discussions/11564 Some services came up to overcome this "problem": http://twss.55uk.net/ http://twitter-rss.com/ (now redirecting to google.com) http://rssitfor.me However these solutions are still shady and let no control to the user about who collects the information about the visited user timelines. This is why Tweeper[1] was born, as an Open Source way to keep following your friends with a certain degree of anonymity, without having to tell Twitter.com whom you are friend to. [1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173 Tweeper can easily scrape sites other than Twitter, it is just a matter of writing an xsl stylesheet for the transformation; an example for pump.io activity stream is provided in rss_converter_pump.io.xsl The currently supported sites are: * Twitter.com * Pump.io based websites, like Identi.ca * Dilbert.com * Instagram.com * Facebook.com (public pages) Tweeper can be used via web or as a command line program, for example as a filter in your feed reader, by passing the URL of the user's public timeline as the first argument. Example of use on the command line: $ php tweeper.php https://twitter.com/NSACareers Example of use as a Liferea[2] filter: $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php https://twitter.com/NSAcareers" Example of use with identi.ca: $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://identi.ca/evan" [2] http://lzone.de/liferea/ Tweeper is licensed under the GPLv3. Tweeper was written by Antonio Ospite https://ao2.it tweeper-1.2.0/TODO000066400000000000000000000013311324430233300136500ustar00rootroot00000000000000- re-evaluate the use of trigger_error() or use a custom error handler, because right now the code exists as soon as trigger_error() gets called and any following code is ignored. - write better XSL stylesheets? I am not an XSL expert - evaluate the use of the RSS element - show cards directly in RSS items for twitter.com - show direct links for videos in the Instagram feed - check the encoding of the tweets when UTF is used, maybe solvable with mb_convert_encoding()? See http://php.net/manual/en/domdocument.loadhtml.php - The dependencies on the symphony components in composer.json could be more relaxed like ">=2.7.0", but for now sticking to "2.7.*" is good enough. - Add support for instagram tags tweeper-1.2.0/autoload.php000066400000000000000000000054661324430233300155160ustar00rootroot00000000000000 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ $package_name = 'ao2/tweeper'; if (file_exists(__DIR__ . '/vendor/autoload.php')) { /* * If "composer install" has been executed, use the composer autoloader. * * Using __DIR__ is OK as long as this file is on the same level of the * project "vendor/" directory (usually the project root directory). */ require __DIR__ . '/vendor/autoload.php'; } elseif (preg_match('/' . preg_quote('/vendor/' . $package_name, '/') . '$/', __DIR__)) { /* * If running from a "vendor/" directory of another project use the * autoloader of the parent project. * * This covers the case of running from a symlink in ./vendor/bin/ because * __DIR__ contains the *real path* of this file. * * Note that using __DIR__ here and going back two levels is OK under the * assumptions that this file is in the project root directory, and that the * package name has the structure VENDOR/PROJECT_NAME. */ require __DIR__ . '/../../autoload.php'; } else { /* * Otherwise, run without composer: * * 1. register our own autoloader function for the Tweeper class * * The implementation follows the one suggested in: * http://www.php-fig.org/psr/psr-4/ */ spl_autoload_register(function ($fully_qualified_class_name) { /* This matches the data defined for the PSR-4 autoloader in composer.json */ $namespace_prefix = 'Tweeper\\'; $base_directory = 'src/'; $len = strlen($namespace_prefix); if (strncmp($namespace_prefix, $fully_qualified_class_name, $len) !== 0) { return; } $class_relative = substr($fully_qualified_class_name, $len); $file_path = $base_directory . str_replace('\\', '/', $class_relative) . '.php'; require_once $file_path; }); /* * 2. load the system-wide autoloader from php-symphony-serializer * * This allows to run tweeper without composer, as long as the Symphony * dependencies are available system-wide. * * For example, the Debian package takes care of that. */ require_once 'Symfony/Component/Serializer/autoload.php'; } tweeper-1.2.0/composer.json000066400000000000000000000015141324430233300157050ustar00rootroot00000000000000{ "name": "ao2/tweeper", "type": "library", "description": "Tweeper is a web scraper to convert popular social media sites to RSS (e.g. Twitter.com, Instagram.com).", "keywords": ["Twitter", "Instagram", "Facebook", "RSS", "scraper"], "homepage": "https://git.ao2.it/tweeper.git", "license": "GPL-3.0+", "authors": [ { "name": "Antonio Ospite", "email": "ao2@ao2.it", "homepage": "https://ao2.it", "role": "Developer" } ], "require": { "php": ">=5.3.0", "ext-curl": "*", "ext-dom": "*", "ext-json": "*", "ext-xsl": "*", "symfony/serializer": ">=2.7.0", "symfony/property-access": ">=2.7.0" }, "autoload": { "psr-4": { "Tweeper\\": "src/" } }, "bin": ["tweeper"] } tweeper-1.2.0/src/000077500000000000000000000000001324430233300137515ustar00rootroot00000000000000tweeper-1.2.0/src/Tweeper.php000066400000000000000000000244121324430233300161000ustar00rootroot00000000000000 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ use DOMDocument; use XSLTProcessor; use Symfony\Component\Serializer\Serializer; use Symfony\Component\Serializer\Encoder\XmlEncoder; use Symfony\Component\Serializer\Normalizer\ObjectNormalizer; date_default_timezone_set('UTC'); /** * Scrape supported websites and perform conversion to RSS. */ class Tweeper { private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; /** * Constructor sets up {@link $generate_enclosure}. */ public function __construct($generate_enclosure = FALSE) { $this->generate_enclosure = $generate_enclosure; } /** * Convert numeric Epoch to the date format expected in a RSS document. */ public static function epochToRssDate($timestamp) { if (!is_numeric($timestamp) || is_nan($timestamp)) { $timestamp = 0; } return gmdate(DATE_RSS, $timestamp); } /** * Convert generic date string to the date format expected in a RSS document. */ public static function strToRssDate($date) { $timestamp = strtotime($date); if (FALSE === $timestamp) { $timestamp = 0; } return Tweeper::epochToRssDate($timestamp); } /** * Convert string to UpperCamelCase. */ public static function toUpperCamelCase($str, $delim = ' ') { $str_upper = ucwords($str, $delim); $str_camel_case = str_replace($delim, '', $str_upper); return $str_camel_case; } /** * Get the contents from a URL. */ private static function getUrlContents($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => FALSE, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_HTTPHEADER => array('Accept-language: en'), CURLOPT_USERAGENT => Tweeper::$userAgent, )); $contents = curl_exec($ch); if (FALSE === $contents) { trigger_error(curl_error($ch)); } curl_close($ch); return $contents; } /** * Get the headers from a URL. */ private static function getUrlInfo($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => TRUE, CURLOPT_NOBODY => TRUE, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_USERAGENT => Tweeper::$userAgent, )); curl_exec($ch); $url_info = curl_getinfo($ch); if (FALSE === $url_info) { trigger_error(curl_error($ch)); } curl_close($ch); return $url_info; } /** * Generate an RSS element. */ public static function generateEnclosure($url) { $supported_content_types = array( "application/octet-stream", "application/ogg", "application/pdf", "audio/aac", "audio/mp4", "audio/mpeg", "audio/ogg", "audio/vorbis", "audio/wav", "audio/webm", "audio/x-midi", "image/gif", "image/jpeg", "image/png", "video/avi", "video/mp4", "video/mpeg", "video/ogg", ); $url_info = Tweeper::getUrlInfo($url); $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']); return ''; } // The RSS specification says that the enclosure element URL must be http. // See http://sourceforge.net/p/feedvalidator/bugs/72/ $http_url = preg_replace("/^https/", "http", $url_info['url']); $dom = new DOMDocument(); $enc = $dom->createElement('enclosure'); $enc->setAttribute('url', $http_url); $enc->setAttribute('length', $url_info['download_content_length']); $enc->setAttribute('type', $url_info['content_type']); return $enc; } /** * Mimic the message from libxml.c::php_libxml_ctx_error_level() */ private static function logXmlError($error) { $output = ""; switch ($error->level) { case LIBXML_ERR_WARNING: $output .= "Warning $error->code: "; break; case LIBXML_ERR_ERROR: $output .= "Error $error->code: "; break; case LIBXML_ERR_FATAL: $output .= "Fatal Error $error->code: "; break; } $output .= trim($error->message); if ($error->file) { $output .= " in $error->file"; } else { $output .= " in Entity,"; } $output .= " line $error->line"; error_log($output); } /** * Convert json to XML. */ private static function jsonToXml($json, $root_node_name) { // Apparently the ObjectNormalizer used afterwards is not able to handle // the stdClass object created by json_decode() with the default setting // $assoc = false; so use $assoc = true. $data = json_decode($json, $assoc = TRUE); if (!$data) { return NULL; } $encoder = new XmlEncoder(); $normalizer = new ObjectNormalizer(); $serializer = new Serializer(array($normalizer), array($encoder)); $serializer_options = array( 'xml_encoding' => "UTF-8", 'xml_format_output' => TRUE, 'xml_root_node_name' => $root_node_name, ); $xml_data = $serializer->serialize($data, 'xml', $serializer_options); if (!$xml_data) { trigger_error("Cannot serialize data", E_USER_ERROR); return NULL; } return $xml_data; } /** * Convert the Instagram content to XML. */ private function getXmlInstagramCom($html) { // Extract the json data from the html code. $json_match_expr = '/window._sharedData = (.*);/'; $ret = preg_match($json_match_expr, $html, $matches); if ($ret !== 1) { trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); return NULL; } // The "qe" object contains elements which will result in invalid XML // element names, so remove it. $data = json_decode($matches[1], $assoc = TRUE); unset($data["qe"]); $json = json_encode($data); return Tweeper::jsonToXml($json, 'instagram'); } /** * Make the Facebook HTML processable. */ private function preprocessHtmlFacebookCom($html) { $html = str_replace('', '', $html); return $html; } /** * Convert the HTML retrieved from the site to XML. */ private function htmlToXml($html, $host) { $xmlDoc = new DOMDocument(); // Handle warnings and errors when loading invalid HTML. $xml_errors_value = libxml_use_internal_errors(TRUE); // If there is a host-specific method to get the XML data, use it! $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $get_xml_host_method)) { $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); $xmlDoc->loadXML($xml_data); } else { $xmlDoc->loadHTML($html); } foreach (libxml_get_errors() as $xml_error) { Tweeper::logXmlError($xml_error); } libxml_clear_errors(); libxml_use_internal_errors($xml_errors_value); return $xmlDoc; } /** * Load a stylesheet if the web site is supported. */ private function loadStylesheet($host) { $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; if (FALSE === file_exists($stylesheet)) { trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); return NULL; } $stylesheet_contents = Tweeper::getUrlContents($stylesheet); $xslDoc = new DOMDocument(); $xslDoc->loadXML($stylesheet_contents); $xsltProcessor = new XSLTProcessor(); $xsltProcessor->registerPHPFunctions(); $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure); $xsltProcessor->importStylesheet($xslDoc); return $xsltProcessor; } /** * Convert the site content to RSS. */ public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) { $url = parse_url($src_url); if (FALSE === $url) { trigger_error("Invalid URL: $src_url", E_USER_ERROR); return NULL; } if (TRUE === $validate_scheme) { $scheme = $url["scheme"]; if (!in_array($scheme, array("http", "https"))) { trigger_error("unsupported scheme: $scheme", E_USER_ERROR); return NULL; } } // if the host is not given derive it from the URL if (NULL === $host) { if (empty($url["host"])) { trigger_error("Invalid host in URL: $src_url", E_USER_ERROR); return NULL; } // Strip the leading www. to be more forgiving on input URLs. $host = preg_replace('/^www\./', '', $url["host"]); } $xsltProcessor = $this->loadStylesheet($host); if (NULL === $xsltProcessor) { return NULL; } $html = Tweeper::getUrlContents($src_url); if (FALSE === $html) { return NULL; } $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $preprocess_html_host_method)) { $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); } $xmlDoc = $this->htmlToXml($html, $host); if (NULL === $xmlDoc) { return NULL; } $output = $xsltProcessor->transformToXML($xmlDoc); if (FALSE === $output) { trigger_error('XSL transformation failed.', E_USER_ERROR); return NULL; } return $output; } } tweeper-1.2.0/src/rss_converter_dilbert.com.xsl000066400000000000000000000113471324430233300216670ustar00rootroot00000000000000 <xsl:variable name="title-length" select="140"/> <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> <xsl:choose> <xsl:when test="string-length($picture-title) > $title-length"> <xsl:variable name="truncated-length" select="$title-length - 3"/> <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/> <xsl:text>...</xsl:text> </xsl:when> <xsl:otherwise> <xsl:value-of select="$picture-title"/> </xsl:otherwise> </xsl:choose> <![CDATA[ {$picture-title} ]]> Tweeper <xsl:value-of select="$channel-title"/> <xsl:value-of select="$channel-title"/> tweeper-1.2.0/src/rss_converter_facebook.com.xsl000066400000000000000000000134721324430233300220140ustar00rootroot00000000000000 https://facebook.com <xsl:variable name="item-title" select="$item-content//p"/> <xsl:variable name="title-length" select="140"/> <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> <xsl:choose> <xsl:when test="string-length($item-title) > $title-length"> <xsl:variable name="truncated-length" select="$title-length - 3"/> <xsl:value-of select="substring($item-title, 1, $truncated-length)"/> <xsl:text>...</xsl:text> </xsl:when> <xsl:otherwise> <xsl:value-of select="$item-title"/> </xsl:otherwise> </xsl:choose> <![CDATA[ ]]> Tweeper <xsl:value-of select="$channel-title"/> <![CDATA[ ]]> <xsl:value-of select="$channel-title"/> tweeper-1.2.0/src/rss_converter_identi.ca.xsl000077700000000000000000000000001324430233300264052rss_converter_pump.io.xslustar00rootroot00000000000000tweeper-1.2.0/src/rss_converter_instagram.com.xsl000066400000000000000000000160141324430233300222230ustar00rootroot00000000000000 https://instagram.com <xsl:variable name="title-length" select="140"/> <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/> <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> <xsl:choose> <xsl:when test="string-length($item-content-title) > $title-length"> <xsl:variable name="truncated-length" select="$title-length - 3"/> <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/> <xsl:text>...</xsl:text> </xsl:when> <xsl:otherwise> <xsl:value-of select="$item-content-title"/> </xsl:otherwise> </xsl:choose> <![CDATA[

(Video)


]]>
Tweeper <xsl:value-of select="$channel-title"/> <![CDATA[ ]]> <xsl:value-of select="$channel-title"/>
tweeper-1.2.0/src/rss_converter_pump.io.xsl000066400000000000000000000107671324430233300210610ustar00rootroot00000000000000 <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/> <![CDATA[ ]]> Tweeper <xsl:value-of select="$channel-title"/> <xsl:value-of select="$channel-title"/> tweeper-1.2.0/src/rss_converter_twitter.com.xsl000066400000000000000000000224131324430233300217400ustar00rootroot00000000000000 https://twitter.com <xsl:value-of select="concat($user-name, ': ')"/> <xsl:if test="$item-has-video"> <xsl:text>(Video) </xsl:text> </xsl:if> <!-- Prepend a space in front of the URLs which are not preceded by an open parenthesis, for aestethic reasons. Also, regex, I know: http://xkcd.com/1171/ --> <xsl:variable name="processed-title" select="php:functionString('preg_replace', '@((?<!\()(?:http[s]?://|pic.twitter.com))@', ' \1', $item-content)"/> <!-- Also strip   and … --> <xsl:value-of select="normalize-space(translate($processed-title, ' …', ''))"/> <![CDATA[ (Video) white-space: pre-wrap; ]]> Tweeper <xsl:value-of select="$channel-title"/> <xsl:value-of select="$channel-title"/> tweeper-1.2.0/tests/000077500000000000000000000000001324430233300143245ustar00rootroot00000000000000tweeper-1.2.0/tests/fetch_facebook_page.sh000077500000000000000000000010461324430233300206020ustar00rootroot00000000000000#!/bin/sh # # Facebook requires a CAPTCHA most of the times, so keep fetching the URL as # long as needed, until the page is shown with no CAPTCHA. set -e USER_AGENT="Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; while true; do # Force language to en-us to make sure that the string matching works OUTPUT=$(wget -nv --user-agent="$USER_AGENT" --header='Accept-Language: en-us' -O - -- "$1") if echo $OUTPUT | grep -q -v "Security Check Required"; then echo "$OUTPUT" > facebook.html break fi sleep 5 done tweeper-1.2.0/tests/instument_to_catch_promoted_tweets.diff000066400000000000000000000012641324430233300243570ustar00rootroot00000000000000diff --git a/src/Tweeper.php b/src/Tweeper.php index 8ac2fe3..c45aab5 100644 --- a/src/Tweeper.php +++ b/src/Tweeper.php @@ -355,6 +355,15 @@ class Tweeper { $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); } + // XXX REMOVE: instrumentation to catch promoted tweets + if ($host == "twitter.com") { + $twitter_promoted_match_expr = '/promoted/i'; + $ret = preg_match($twitter_promoted_match_expr, $html, $matches); + if ($ret) { + file_put_contents("/home/ao2/TWITTER_PROMOTED_DUMP.html", $html); + } + } + $xmlDoc = $this->htmlToXml($html, $host); if (NULL === $xmlDoc) { return NULL; tweeper-1.2.0/tests/test_information_leakage.sh000077500000000000000000000020201324430233300217120ustar00rootroot00000000000000#!/bin/sh set -e TWEEPER="/usr/share/php/tweeper/tweeper" #TWEEPER="./tweeper" check_result() { URL="$1" FILE="$2" RESULT="$3" echo "URL $URL" if [ "$RESULT" ]; then echo "--> $FILE" echo " exists" else echo "... $FILE" echo " does not exist" fi echo } file_exists() { FILE="$1" URL="file://twitter.com/$FILE" OUTPUT=$($TWEEPER $URL) check_result "$URL" "$FILE" "$OUTPUT" } file_exists_on_server() { SERVER="$1" FILE="$2" URL="file://twitter.com/$FILE" OUTPUT=$(curl $SERVER/tweeper.php?src_url=$URL 2> /dev/null) check_result "$URL" "$FILE on $SERVER" "$OUTPUT" } file_exists /etc/passwd || true file_exists /etc/file_with_an_unlikely_name || true echo "Staring a test server" echo php -S localhost:8000 -t $(dirname $TWEEPER) > /dev/null 2>&1 & SERVER_PID=$! sleep 1 file_exists_on_server http://localhost:8000 /etc/passwd || true file_exists_on_server http://localhost:8000 /etc/file_with_an_unlikely_name || true echo "Shutting down the test server" kill $SERVER_PID tweeper-1.2.0/tests/tweeper_file000077500000000000000000000006471324430233300167330ustar00rootroot00000000000000#!/usr/bin/env php \n"; if ($argc < 3) { fwrite(STDERR, $usage); exit(1); } $file_url = 'file://' . realpath($argv[1]); $host = $argv[2]; $tweeper = new Tweeper(); $output = $tweeper->tweep($file_url, $host, false); if (is_null($output)) { exit(1); } echo $output; tweeper-1.2.0/tweeper000077500000000000000000000011321324430233300145600ustar00rootroot00000000000000#!/usr/bin/env php element *-h, --help*:: show the help message EXAMPLE OF USE -------------- Getting the RSS feed of some Twitter user: tweeper https://twitter.com/NSACareers Using tweeper as a filter for the Liferea feed reader: liferea-add-feed "|tweeper https://twitter.com/NSAcareers" To use tweeper via web there are two options (the examples assume the installation directory to be `/usr/share/php/tweeper/`): 1. Using the PHP built-in web server: php -S localhost:8000 -t /usr/share/php/tweeper/ + and then visit 'http://localhost:8000/tweeper.php' in the web browser. 2. Using a generic web server with the document root in '/var/www': sudo ln -s /usr/share/php/tweeper/tweeper.php /var/www xdg-open http://localhost/tweeper.php?src_url=http://twitter.com/NSAcareers + It is enough to create the symlink only the very first time tweeper is used this way. NOTES ----- In order to use tweeper with a symlink with the apache 'userdir' module, the 'SymLinksIfOwnerMatch' option must be replaced by 'FollowSymlink' in /etc/apache2/mods-enabled/userdir.conf EXIT STATUS ----------- *0*:: Success *!0*:: Failure AUTHORS ------- Antonio Ospite RESOURCES --------- Main web site: COPYING ------- Copyright \(C) 2013-2016 Antonio Ospite This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. tweeper-1.2.0/tweeper.php000066400000000000000000000052521324430233300153520ustar00rootroot00000000000000 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ require_once 'autoload.php'; use Tweeper\Tweeper; date_default_timezone_set('UTC'); /** * Check if the script is being run from the command line. */ function is_cli() { return (php_sapi_name() === "cli"); } /** * Show the script usage. */ function usage($argv) { if (is_cli()) { $usage = "{$argv[0]} [-e|-h|--help] \n"; } else { $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=&generate_enclosure=<0|1>"); } return "usage: $usage"; } /** * Parse command line options. */ function parse_options_cli($argv, $argc) { $options = array( 'generate_enclosure' => FALSE, ); if ($argc < 2) { return $options; } $cli_options = getopt("eh", array("help")); foreach ($cli_options as $opt => $val) { switch ($opt) { case 'e': $options['generate_enclosure'] = TRUE; break; case 'h': case 'help': echo usage($argv); exit(0); default: fwrite(STDERR, usage($argv)); exit(1); } } $options['src_url'] = $argv[count($cli_options) + 1]; return $options; } /** * Parse options passed from a query string. */ function parse_options_query_string() { $options = array( 'generate_enclosure' => FALSE, ); if (isset($_GET['src_url'])) { $options['src_url'] = $_GET['src_url']; } if (isset($_GET['generate_enclosure'])) { $options['generate_enclosure'] = $_GET['generate_enclosure'] == 1; } return $options; } if (is_cli()) { $options = parse_options_cli($argv, $argc); $error_stream = fopen('php://stderr', 'w'); } else { $options = parse_options_query_string(); $error_stream = fopen('php://output', 'w'); } if (!isset($options['src_url'])) { fwrite($error_stream, usage(is_cli() ? $argv : NULL)); exit(1); } $tweeper = new Tweeper($options['generate_enclosure']); $output = $tweeper->tweep($options['src_url']); if (is_null($output)) { exit(1); } echo $output;