pax_global_header00006660000000000000000000000064125753354010014517gustar00rootroot0000000000000052 comment=21834a24e33e223229427361b1bd2c793801ddd4 tweeper-0.4/000077500000000000000000000000001257533540100130355ustar00rootroot00000000000000tweeper-0.4/INSTALL000066400000000000000000000005161257533540100140700ustar00rootroot00000000000000The recommended way to install tweeper globally is to install all its files under /usr/share/php/tweeper and then make a symlink to the wrapper script "tweeper" under /usr/bin Tweeper depends on php-xml-serializer which is used to convert json to xml for some sites that provide the timeline data in json rather than in usable html. tweeper-0.4/Makefile000066400000000000000000000014101257533540100144710ustar00rootroot00000000000000# Packagers may want to override this! prefix ?= /usr/local PHP_SCRIPT_DIR ?= $(prefix)/share/php BIN_DIR := $(prefix)/bin MAN_DIR := $(prefix)/share/man TWEEPER_DIR := $(PHP_SCRIPT_DIR)/tweeper all: clean: rm -f tweeper.1 docs: a2x -f manpage tweeper.1.asciidoc installdocs: docs install -d $(DESTDIR)$(MAN_DIR)/man1 install -m644 tweeper.1 $(DESTDIR)$(MAN_DIR)/man1 install: installdocs install -d $(DESTDIR)$(TWEEPER_DIR) install -m644 *.xsl $(DESTDIR)$(TWEEPER_DIR) install -m644 *.php $(DESTDIR)$(TWEEPER_DIR) install -m755 tweeper $(DESTDIR)$(TWEEPER_DIR) install -d $(DESTDIR)$(BIN_DIR) ln -sf $(TWEEPER_DIR)/tweeper $(DESTDIR)$(BIN_DIR)/tweeper @echo -e "\n\nINSTALLATION COMPLETE" @echo -e "Make sure '$(PHP_SCRIPT_DIR)' is in PHP include_path!\n" tweeper-0.4/NEWS000066400000000000000000000016451257533540100135420ustar00rootroot00000000000000News for v0.4: ============== * Make the generated RSS validate with feedvalidator.org * Fix support for Dilbert.com * Add support for Instragram.com * Add support for public pages on Facebook.com * Make tweeper work with the PHP built-in web server * Misc fixes to code and documentation News for v0.3: ============== * Support generating enclosure for "audio/ogg" links * Always specify xml:base to improve local URLs expansions in some cases * Support both the classic and the new Twitter profile pages * Fix getting the profile picture of Twitter users * Add support for Howtoons.com News for v0.2: ============== * Small fixes to the man page which must be in the coming Debian package News for v0.1: ============== * Initial version supporting Twitter.com * Add support for pump.io sites * Add support for dilbert.com * Show links to supported media files in the RSS element tweeper-0.4/README000066400000000000000000000042201257533540100137130ustar00rootroot00000000000000Tweeper is a web scraper which can be used to conveniently follow the public activity of social network users without the need to log in or even be subscribed to the social network; tweeper converts the public information to RSS so that it can be accessed and collected by a feed reader. Since Jun 11th 2013, when Twitter.com retired their API v1.0, it has not been possible anymore to access a user timeline via RSS, and it has also become mandatory to authenticate via OAuth to access this _public_ information in the JSON format: https://dev.twitter.com/discussions/16289 https://dev.twitter.com/discussions/11564 Some services came up to overcome this "problem": http://twss.55uk.net/ http://twitter-rss.com/ (now redirecting to google.com) http://rssitfor.me However these solutions are still shady and let no control to the user about who collects the information about the visited user timelines. This is why Tweeper[1] was born, as an Open Source way to keep following your friends with a certain degree of anonymity, without having to tell Twitter.com whom you are friend to. [1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173 Tweeper can easily scrape sites other than Twitter, it is just a matter of writing an xsl stylesheet for the transformation; an example for pump.io activity stream is provided in rss_converter_pump.io.xsl The currently supported sites are: * Twitter.com * Pump.io based websites, like Identi.ca * Dilbert.com * Howtoons.com * Instagram.com * Facebook.com (public pages) Tweeper can be used via web or as a command line program, for example as a filter in your feed reader, by passing the URL of the user's public timeline as the first argument. Example of use on the command line: $ php tweeper.php http://twitter.com/NSACareers Example of use as a Liferea[2] filter: $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://twitter.com/NSAcareers" Example of use with identi.ca: $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://identi.ca/evan" [2] http://lzone.de/liferea/ Tweeper is licensed under the GPLv3. Tweeper was written by Antonio Ospite http://ao2.it tweeper-0.4/TODO000066400000000000000000000006671257533540100135360ustar00rootroot00000000000000- write a better XSL stylesheet? I am not an XSL expert. - evaluate the use of the RSS element. - use the element for pump.io media objects - use the element for images on dilbert.com - show images (or even cards) directly in RSS items for twitter.com - check the encoding of the tweets when UTF is used, maybe solvable with mb_convert_encoding()? See http://php.net/manual/en/domdocument.loadhtml.php tweeper-0.4/rss_converter_dilbert.com.xsl000066400000000000000000000076051257533540100207550ustar00rootroot00000000000000 <xsl:value-of select="$picture-title"/> <![CDATA[ ]]> Tweeper <xsl:value-of select="$channel-title"/> <xsl:value-of select="$channel-title"/> tweeper-0.4/rss_converter_facebook.com.xsl000066400000000000000000000114601257533540100210730ustar00rootroot00000000000000 https://facebook.com <xsl:variable name="item-title" select="$item-content/p"/> <xsl:variable name="title-length" select="140"/> <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> <xsl:choose> <xsl:when test="string-length($item-title) > $title-length"> <xsl:variable name="truncated-length" select="$title-length - 3"/> <xsl:value-of select="substring($item-title, 1, $truncated-length)"/> <xsl:text>...</xsl:text> </xsl:when> <xsl:otherwise> <xsl:value-of select="$item-title"/> </xsl:otherwise> </xsl:choose> <![CDATA[ ]]> Tweeper <xsl:value-of select="$channel-title"/> <![CDATA[ ]]> <xsl:value-of select="$channel-title"/> tweeper-0.4/rss_converter_howtoons.com.xsl000066400000000000000000000100221257533540100211730ustar00rootroot00000000000000 http://howtoons.com <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/> <![CDATA[ ]]> Tweeper <xsl:value-of select="$channel-title"/> The world's greatest D.I.Y. comic website! Tools of mass construction! <xsl:value-of select="$channel-title"/> http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png tweeper-0.4/rss_converter_identi.ca.xsl000077700000000000000000000000001257533540100254712rss_converter_pump.io.xslustar00rootroot00000000000000tweeper-0.4/rss_converter_instagram.com.xsl000066400000000000000000000136261257533540100213150ustar00rootroot00000000000000 https://instagram.com <xsl:variable name="title-length" select="140"/> <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/> <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> <xsl:choose> <xsl:when test="string-length($item-content-title) > $title-length"> <xsl:variable name="truncated-length" select="$title-length - 3"/> <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/> <xsl:text>...</xsl:text> </xsl:when> <xsl:otherwise> <xsl:value-of select="$item-content-title"/> </xsl:otherwise> </xsl:choose> <![CDATA[

(Video)


]]>
Tweeper <xsl:value-of select="$channel-title"/> <![CDATA[ ]]> <xsl:value-of select="$channel-title"/>
tweeper-0.4/rss_converter_pump.io.xsl000066400000000000000000000077671257533540100201530ustar00rootroot00000000000000 <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/> <![CDATA[ ]]> Tweeper <xsl:value-of select="$channel-title"/> <xsl:value-of select="$channel-title"/> tweeper-0.4/rss_converter_twitter.com.xsl000066400000000000000000000104431257533540100210240ustar00rootroot00000000000000 https://twitter.com <xsl:value-of select="concat($user-name, ': ', $item-content)"/> <![CDATA[ ]]> Tweeper <xsl:value-of select="$channel-title"/> <xsl:value-of select="$channel-title"/> tweeper-0.4/tweeper000077500000000000000000000001071257533540100144340ustar00rootroot00000000000000#!/usr/bin/env php element *-h, --help*:: show the help message EXAMPLE OF USE -------------- Getting the RSS feed of some Twitter user: tweeper http://twitter.com/NSACareers Using tweeper as a filter for the Liferea feed reader: liferea-add-feed "|tweeper http://twitter.com/NSAcareers" To use tweeper via web there are two options (the examples assume the installation directory to be `/usr/share/php/tweeper/`): 1. Using the PHP built-in web server: php -S localhost:8000 -t /usr/share/php/tweeper/ + and then visit 'http://localhost:8000/tweeper.php' in the web browser. 2. Using a generic web server with the document root in '/var/www': sudo ln -s /usr/share/php/tweeper/tweeper.php /var/www xdg-open http://localhost/tweeper.php?src_url=http://twitter.com/NSAcareers + It is enough to create the symlink only the very first time tweeper is used this way. NOTES ----- In order to use tweeper with a symlink with the apache 'userdir' module, the 'SymLinksIfOwnerMatch' option must be replaced by 'FollowSymlink' in /etc/apache2/mods-enabled/userdir.conf EXIT STATUS ----------- *0*:: Success *!0*:: Failure AUTHORS ------- Antonio Ospite RESOURCES --------- Main web site: COPYING ------- Copyright \(C) 2013-2015 Antonio Ospite This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. tweeper-0.4/tweeper.php000066400000000000000000000226341257533540100152300ustar00rootroot00000000000000 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ require_once 'XML/Serializer.php'; date_default_timezone_set('UTC'); class Tweeper { private static $USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; public function __construct($generate_enclosure = FALSE) { $this->generate_enclosure = $generate_enclosure; } public static function epoch_to_gmdate($timestamp) { if (!is_numeric($timestamp) || is_nan($timestamp)) { $timestamp = 0; } return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT'; } public static function str_to_gmdate($date) { $timestamp = strtotime($date); if (FALSE === $timestamp) { $timestamp = 0; } return Tweeper::epoch_to_gmdate($timestamp); } private static function get_contents($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => FALSE, CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_HTTPHEADER => array('Accept-language: en'), CURLOPT_USERAGENT => Tweeper::$USER_AGENT, )); $contents = curl_exec($ch); curl_close($ch); return $contents; } private static function get_info($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => TRUE, CURLOPT_NOBODY => TRUE, CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_USERAGENT => Tweeper::$USER_AGENT, )); curl_exec($ch); $url_info = curl_getinfo($ch); curl_close($ch); return $url_info; } public static function generate_enclosure($url) { $supported_content_types = array( "application/ogg", "audio/aac", "audio/mp4", "audio/mpeg", "audio/ogg", "audio/vorbis", "audio/wav", "audio/webm", "audio/x-midi", "image/gif", "image/jpeg", "video/avi", "video/mp4", "video/mpeg", "video/ogg", ); // The RSS specification says that the enclosure element url must be http. // See http://sourceforge.net/p/feedvalidator/bugs/72/ $http_url = preg_replace("/^https/", "http", $url); $url_info = Tweeper::get_info($http_url); $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']); return ''; } $dom = new DomDocument(); $enc = $dom->createElement('enclosure'); $enc->setAttribute('url', $url_info['url']); $enc->setAttribute('length', $url_info['download_content_length']); $enc->setAttribute('type', $url_info['content_type']); $dom->appendChild($enc); return $dom->saveXML($enc); } /* Mimic the message from libxml.c::php_libxml_ctx_error_level() */ private function log_xml_error($error) { $output = ""; switch ($error->level) { case LIBXML_ERR_WARNING: $output .= "Warning $error->code: "; break; case LIBXML_ERR_ERROR: $output .= "Error $error->code: "; break; case LIBXML_ERR_FATAL: $output .= "Fatal Error $error->code: "; break; } $output .= trim($error->message); if ($error->file) { $output .= " in $error->file"; } else { $output .= " in Entity,"; } $output .=" line $error->line"; error_log($output); } private function load_stylesheet($host) { $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; if (FALSE === file_exists($stylesheet)) { trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); return NULL; } $stylesheet_contents = $this->get_contents($stylesheet); $xslDoc = new DOMDocument(); $xslDoc->loadXML($stylesheet_contents); $xsltProcessor = new XSLTProcessor(); $xsltProcessor->registerPHPFunctions(); $xsltProcessor->setParameter('', 'generateEnclosure', $this->generate_enclosure); $xsltProcessor->importStylesheet($xslDoc); return $xsltProcessor; } private function json_to_xml($html, $json_match_expr, $rootName) { // pre-process, convert json to XML $ret = preg_match($json_match_expr, $html, $matches); if ($ret !== 1) { trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); return NULL; } $data = json_decode($matches[1]); if (!$data) { return NULL; } $serializer_options = array ( 'addDecl' => TRUE, 'encoding' => "UTF-8", 'indent' => ' ', 'rootName' => $rootName, ); $serializer = new XML_Serializer($serializer_options); $status = $serializer->serialize($data); if (PEAR::isError($status)) { trigger_error($status->getMessage(), E_USER_ERROR); return NULL; } return $serializer->getSerializedData(); } private function get_xml_instagram_com($html) { return $this->json_to_xml($html, '/window._sharedData = (.*);/', 'instagram'); } private function preprocess_html_facebook_com($html) { $html = str_replace('', '', $html); return $html; } private function html_to_xml($html, $host) { $xmlDoc = new DOMDocument(); // Handle warnings and errors when loading invalid HTML. $xml_errors_value = libxml_use_internal_errors(true); // If there is a host-specific method to get the xml data, use it! $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host); if (method_exists($this, $get_xml_host_method)) { $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); $xmlDoc->loadXML($xml_data); } else { $xmlDoc->loadHTML($html); } foreach (libxml_get_errors() as $xml_error) { $this->log_xml_error($xml_error); } libxml_clear_errors(); libxml_use_internal_errors($xml_errors_value); return $xmlDoc; } public function tweep($src_url) { $url = parse_url($src_url); if (FALSE === $url || empty($url["host"])) { trigger_error("Invalid url: $src_url", E_USER_ERROR); return NULL; } // Strip the leading www. to be more forgiving on input URLs $host = preg_replace('/^www\./', '', $url["host"]); $xsltProcessor = $this->load_stylesheet($host); if (NULL === $xsltProcessor) { return NULL; } $html = $this->get_contents($src_url); if (FALSE === $html) { return NULL; } $preprocess_html_host_method = 'preprocess_html_' . str_replace(".", "_", $host); if (method_exists($this, $preprocess_html_host_method)) { $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); } $xmlDoc = $this->html_to_xml($html, $host); if (NULL === $xmlDoc) { return NULL; } $output = $xsltProcessor->transformToXML($xmlDoc); if (FALSE === $output) { trigger_error('XSL transformation failed.', E_USER_ERROR); return NULL; } return $output; } } function is_cli() { return (php_sapi_name() === "cli"); } function usage($argv) { if (is_cli()) { $usage = "{$argv[0]} [-e|-h|--help] \n"; } else { $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=&generate_enclosure=<0|1>"); } return "usage: $usage"; } function parse_options_cli($argv, $argc) { $options = array( 'generate_enclosure' => FALSE ); if ($argc < 2) return $options; $cli_options = getopt("eh", array("help")); foreach ($cli_options as $opt => $val) { switch ($opt) { case 'e': $options['generate_enclosure'] = TRUE; break; case 'h': case 'help': echo usage($argv); exit(0); default: fwrite(STDERR, usage($argv)); exit(1); } } $options['src_url'] = $argv[count($cli_options) + 1]; return $options; } function parse_options_query_string() { $options = array( 'generate_enclosure' => FALSE ); if (isset($_GET['src_url'])) $options['src_url'] = $_GET['src_url']; if (isset($_GET['generate_enclosure'])) $options['generate_enclosure'] = $_GET['generate_enclosure'] == 1; return $options; } if (is_cli()) { $options = parse_options_cli($argv, $argc); $ERROR_STREAM = fopen('php://stderr', 'w'); } else { $options = parse_options_query_string(); $ERROR_STREAM = fopen('php://output', 'w'); } if (!isset($options['src_url'])) { fwrite($ERROR_STREAM, usage(is_cli() ? $argv : NULL)); exit(1); } $tweeper = new Tweeper($options['generate_enclosure']); echo $tweeper->tweep($options['src_url']);