pax_global_header00006660000000000000000000000064122424027620014513gustar00rootroot0000000000000052 comment=3e7e6750533bddf1910b7d07d5a283330b051ca3 tweeper-0.2/000077500000000000000000000000001224240276200130275ustar00rootroot00000000000000tweeper-0.2/ChangeLog000066400000000000000000000147571224240276200146170ustar00rootroot000000000000002013-11-18 12:56:52 +0100 Antonio Ospite * tweeper.1.asciidoc: small fixes to the man page (HEAD, origin/master, master) 2013-11-18 12:12:16 +0100 Antonio Ospite * tweeper.1.asciidoc: add a missing semicolon 2013-11-18 01:01:29 +0100 Antonio Ospite * Add a ChangeLog file (tag: v0.1) 2013-11-18 00:59:58 +0100 Antonio Ospite * Add a NEWS file 2013-11-18 00:59:20 +0100 Antonio Ospite * Add a Makefile rule to generate a Changelog file 2013-11-18 00:43:00 +0100 Antonio Ospite * Add a man page 2013-11-08 16:22:26 +0100 Antonio Ospite * Add a Makefile to simplify installation and packaging 2013-11-08 16:17:48 +0100 Antonio Ospite * Add a wrapper script intended to be called as an executable 2013-11-08 15:01:25 +0100 Antonio Ospite * Write error messages on STDERR and return saner values in CLI mode 2013-11-08 13:09:19 +0100 Antonio Ospite * TODO: add more info about checking UTF output 2013-11-08 10:44:44 +0100 Antonio Ospite * Handle errors and warnings from loadHTML() 2013-10-06 11:01:46 +0200 Antonio Ospite * Show the actual name of the user the tweet comes from 2013-08-12 10:16:27 +0200 Antonio Ospite * Follow HTTP redirects in get_contents() too 2013-08-12 01:25:56 +0200 Antonio Ospite * Add some entries to the TODO file 2013-08-12 01:22:35 +0200 Antonio Ospite * Merge branch 'generate-enclosure-element' 2013-08-12 01:16:10 +0200 Antonio Ospite * Cosmetics: re-indent cURL options to follow the coding style (generate-enclosure-element) 2013-08-12 01:13:56 +0200 Antonio Ospite * Use cURL for Tweeper::get_contents() too 2013-08-12 01:06:23 +0200 Antonio Ospite * Remove double semicolon in Tweeper::get_info() 2013-08-11 21:23:42 +0200 Antonio Ospite * Make get_url_info() and generate_enclosure() static methods 2013-08-11 21:15:41 +0200 Antonio Ospite * Turn epoch_to_gmdate() and str_to_gmdate() into static methods 2013-08-11 21:11:03 +0200 Antonio Ospite * Make get_contents() a static method 2013-08-11 20:57:02 +0200 Antonio Ospite * Cosmetics: sort supported_content_types, remove unneeded spaces 2013-08-11 20:52:47 +0200 Antonio Ospite * Use an array to list supported content types for enclosures 2013-08-11 20:44:37 +0200 Antonio Ospite * Make it optional to generate the element 2013-08-11 20:27:36 +0200 Antonio Ospite * Use getopt() to parse command line options 2013-08-11 20:08:37 +0200 Antonio Ospite * Split parsing CLI options from parsing QUERY_STRING ones 2013-08-11 13:43:05 +0200 Antonio Ospite * Use templates to generate enclosures 2013-08-11 12:48:21 +0200 Antonio Ospite * Merge https://github.com/grote/Tweeper into generate-encolure-elements 2013-08-11 12:43:42 +0200 Antonio Ospite * Fix a typo: s/tweeter/Twitter/ 2013-08-04 23:22:02 +0200 Torsten Grote * only enclosify certain mimetypes, use same user agent 2013-08-04 22:00:51 +0200 Torsten Grote * add initial support for enclosures 2013-08-03 20:56:55 +0200 Antonio Ospite * Fix a typo in an error message 2013-07-28 22:34:06 +0200 Antonio Ospite * Add an RSS conversion stylesheet for dilbert.com 2013-07-28 22:30:26 +0200 Antonio Ospite * TODO: mention the RSS element 2013-07-28 22:28:55 +0200 Antonio Ospite * rss_converter_twitter.com.xsl: use concat() more 2013-07-27 17:14:07 +0200 Antonio Ospite * Add an example with identi.ca 2013-07-27 17:05:03 +0200 Antonio Ospite * Mention in the README that other sites can be converted to RSS (local-ao2) 2013-07-27 16:51:38 +0200 Antonio Ospite * Add initial support for scraping Pump.io activity streams 2013-07-27 16:46:23 +0200 Antonio Ospite * Change mode of tweeper.php 2013-07-27 16:45:47 +0200 Antonio Ospite * Add -h and --help options 2013-07-27 16:38:46 +0200 Antonio Ospite * Add another date conversion routine 2013-07-27 16:36:36 +0200 Antonio Ospite * Update the documentation to use URLs as arguments 2013-07-27 16:35:47 +0200 Antonio Ospite * Mention http://rssitfor.me as an alternative service 2013-07-27 16:04:41 +0200 Antonio Ospite * Use __DIR__ when building the stylesheet path name 2013-07-27 16:01:36 +0200 Antonio Ospite * Rename formatDate() function to epoch_to_gmdate() 2013-07-27 13:31:59 +0200 Antonio Ospite * Be more verbose in error messages 2013-07-27 13:24:44 +0200 Antonio Ospite * Make stylesheet file name parametric 2013-07-27 13:09:08 +0200 Antonio Ospite * Change of behavior| Now a URL is required as an argument 2013-07-27 12:49:21 +0200 Antonio Ospite * Factor out a usage() function 2013-07-27 12:43:16 +0200 Antonio Ospite * Use php_sapi_name() to check for CLI interface 2013-07-07 15:34:21 +0200 Antonio Ospite * Fix a typo 2013-07-07 15:33:26 +0200 Antonio Ospite * Add more info about how to call Tweeper from command line 2013-07-07 01:22:47 +0200 Antonio Ospite * Embed the full HTML content of the tweet in the description field 2013-07-06 23:06:12 +0200 Antonio Ospite * Format dates using an external php function 2013-07-06 21:51:53 +0200 Antonio Ospite * Initial import tweeper-0.2/INSTALL000066400000000000000000000002601224240276200140560ustar00rootroot00000000000000The recommended way to install tweeper globally is to install all its files under /usr/share/php/tweeper and then make a symlink to the wrapper script "tweeper" under /usr/bin tweeper-0.2/Makefile000066400000000000000000000015321224240276200144700ustar00rootroot00000000000000# Packagers may want to override this! prefix ?= /usr/local PHP_SCRIPT_DIR ?= $(prefix)/share/php BIN_DIR := $(prefix)/bin MAN_DIR := $(prefix)/share/man TWEEPER_DIR := $(PHP_SCRIPT_DIR)/tweeper all: clean: rm -f tweeper.1 changelog: git log --pretty="format:%ai %aN <%aE>%n%n%x09* %s%d%n" > ChangeLog docs: a2x -f manpage tweeper.1.asciidoc installdocs: docs install -d $(DESTDIR)$(MAN_DIR)/man1 install -m644 tweeper.1 $(DESTDIR)$(MAN_DIR)/man1 install: installdocs install -d $(DESTDIR)$(TWEEPER_DIR) install -m644 *.xsl $(DESTDIR)$(TWEEPER_DIR) install -m644 *.php $(DESTDIR)$(TWEEPER_DIR) install -m755 tweeper $(DESTDIR)$(TWEEPER_DIR) install -d $(DESTDIR)$(BIN_DIR) ln -sf $(TWEEPER_DIR)/tweeper $(DESTDIR)$(BIN_DIR)/tweeper @echo -e "\n\nINTALLATION COMPLETE" @echo -e "Make sure '$(PHP_SCRIPT_DIR)' is in PHP include_path!\n" tweeper-0.2/NEWS000066400000000000000000000004771224240276200135360ustar00rootroot00000000000000News for v0.2: ============== * Small fixes to the man page which must be in the coming Debian package News for v0.1: ============== * Initial version supporting Twitter.com * Add support for pump.io sites * Add support for dilbert.com * Show links to supported media files in the RSS element tweeper-0.2/README000066400000000000000000000036201224240276200137100ustar00rootroot00000000000000Tweeper is a web scraper which extracts the most recent public tweets of a given user from their home page on Twitter.com and formats them in RSS, so the information can be conveniently accessed and collected by a feed reader. Since Jun 11th 2013 Twitter.com retired their API v1.0, so it's not possible to access a user timeline via RSS anymore, and it's also become mandatory to authenticate via OAuth to access this _public_ information in JSON format: https://dev.twitter.com/discussions/16289 https://dev.twitter.com/discussions/11564 Some services came up to overcome this "problem": http://twss.55uk.net/ http://twitter-rss.com/ (now redirecting to google.com) http://rssitfor.me However these solutions are still shady and let no control to the user about who collects the information about the visited user timelines. This is why Tweeper[1] was born, as an Open Source way to keep following your friends with a certain degree of anonymity, without having to tell Twitter.com whom you are friend to. [1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173 Tweeper can be used via web or as a command line program, for example as a filter in your feed reader, by passing the URL of the user's public timeline as the first argument. Tweeper can easily scrape sites other than Twitter, it is just a matter of writing an xsl stylesheet for the transformation; an example for pump.io activity stream is provided in rss_converter_pump.io.xsl Example of use on the command line: $ php tweeper.php http://twitter.com/NSACareers Example of use as a Liferea[2] filter: $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://twitter.com/NSAcareers" Example of use with identi.ca: $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://identi.ca/evan" [2] http://lzone.de/liferea/ Tweeper is licensed under the GPLv3. Tweeper was written by Antonio Ospite http://ao2.it tweeper-0.2/TODO000066400000000000000000000007601224240276200135220ustar00rootroot00000000000000- write a better XSL stylesheet? I am not an XSL expert. - evaluate the use of the RSS element. - use the element for pump.io media objects - use the element for images on dilbert.com - consider using http://www.dilbert.com/fast for dilbert.com - debug some duplicated entries in the tweeter feeds in liferea - check the encoding of the tweets when UTF is used, maybe solvable with mb_convert_encoding()? See http://php.net/manual/en/domdocument.loadhtml.php tweeper-0.2/rss_converter_dilbert.com.xsl000066400000000000000000000065651224240276200207530ustar00rootroot00000000000000 <xsl:value-of select="$picture-element/@title"/> <![CDATA[ ]]> Tweeper <xsl:value-of select="//meta[@property='og:title']/@content"/> tweeper-0.2/rss_converter_identi.ca.xsl000077700000000000000000000000001224240276200254632rss_converter_pump.io.xslustar00rootroot00000000000000tweeper-0.2/rss_converter_pump.io.xsl000066400000000000000000000062451224240276200201330ustar00rootroot00000000000000 <xsl:value-of select="concat($user-name, ': ', normalize-space($activity-text))"/> <![CDATA[ ]]> Tweeper <xsl:value-of select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/> tweeper-0.2/rss_converter_twitter.com.xsl000066400000000000000000000071471224240276200210250ustar00rootroot00000000000000 https://twitter.com <xsl:value-of select="concat($user-name, ': ', $tweet-text)"/> <![CDATA[ ]]> Tweeper <xsl:text>Twitter / </xsl:text><xsl:value-of select="$screen-name"/> tweeper-0.2/tweeper000077500000000000000000000001071224240276200144260ustar00rootroot00000000000000#!/usr/bin/env php element *-h, --help*:: show the help message EXAMPLE OF USE -------------- Getting the RSS feed of some Twitter user: tweeper http://twitter.com/NSACareers Using tweeper as a filter for the Liferea feed reader: liferea-add-feed "|tweeper http://twitter.com/NSAcareers" EXIT STATUS ----------- *0*:: Success *!0*:: Failure AUTHORS ------- Antonio Ospite RESOURCES --------- Main web site: COPYING ------- Copyright \(C) 2013 Antonio Ospite This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. tweeper-0.2/tweeper.php000066400000000000000000000153541224240276200152230ustar00rootroot00000000000000 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ date_default_timezone_set('UTC'); class Tweeper { private static $USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; public function __construct($stylesheet, $generate_enclosure = FALSE) { $stylesheet_contents = $this->get_contents($stylesheet); $xslDoc = new DOMDocument(); $xslDoc->loadXML($stylesheet_contents); $this->xsltProcessor = new XSLTProcessor(); $this->xsltProcessor->registerPHPFunctions(); $this->xsltProcessor->setParameter('', 'generateEnclosure', $generate_enclosure); $this->xsltProcessor->importStylesheet($xslDoc); } public static function epoch_to_gmdate($timestamp) { return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT'; } public static function str_to_gmdate($date) { $timestamp = strtotime($date); return Tweeper::epoch_to_gmdate($timestamp); } private static function get_contents($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => FALSE, CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_HTTPHEADER => array('Accept-language: en'), CURLOPT_USERAGENT => Tweeper::$USER_AGENT, )); $contents = curl_exec($ch); curl_close($ch); return $contents; } private static function get_info($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => TRUE, CURLOPT_NOBODY => TRUE, CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_USERAGENT => Tweeper::$USER_AGENT, )); curl_exec($ch); $url_info = curl_getinfo($ch); curl_close($ch); return $url_info; } public static function generate_enclosure($url) { $supported_content_types = array( "application/ogg", "audio/aac", "audio/mp4", "audio/mpeg", "audio/vorbis", "audio/wav", "audio/webm", "audio/x-midi", "image/gif", "image/jpeg", "video/avi", "video/mp4", "video/mpeg", "video/ogg", ); $url_info = Tweeper::get_info($url); $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']); return ''; } $dom = new DomDocument(); $enc = $dom->createElement('enclosure'); $enc->setAttribute('url', $url_info['url']); $enc->setAttribute('length', $url_info['download_content_length']); $enc->setAttribute('type', $url_info['content_type']); $dom->appendChild($enc); return $dom->saveXML($enc); } /* Mimic the message from libxml.c::php_libxml_ctx_error_level() */ private function log_xml_error($error) { $output = ""; switch ($error->level) { case LIBXML_ERR_WARNING: $output .= "Warning $error->code: "; break; case LIBXML_ERR_ERROR: $output .= "Error $error->code: "; break; case LIBXML_ERR_FATAL: $output .= "Fatal Error $error->code: "; break; } $output .= trim($error->message); if ($error->file) { $output .= " in $error->file"; } else { $output .= " in Entity,"; } $output .=" line $error->line"; error_log($output); } public function tweep($uri) { $html = Tweeper::get_contents($uri); $xmlDoc = new DOMDocument(); // Handle warnings and errors when loading invalid HTML. $xml_errors_value = libxml_use_internal_errors(true); $xmlDoc->loadHTML($html); foreach (libxml_get_errors() as $xml_error) { $this->log_xml_error($xml_error); } libxml_clear_errors(); libxml_use_internal_errors($xml_errors_value); $output = $this->xsltProcessor->transformToXML($xmlDoc); if (FALSE === $output) { trigger_error('XSL transformation failed.', E_USER_ERROR); return NULL; } return $output; } } function usage($argv) { if (php_sapi_name() != 'cli') $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=&generate_enclosure=<0|1>"); else $usage = "{$argv[0]} [-e|-h|--help] \n"; return "usage: $usage"; } function parse_options_cli($argv, $argc) { $options = array( 'generate_enclosure' => FALSE ); if ($argc < 2) return $options; $cli_options = getopt("eh", array("help")); foreach ($cli_options as $opt => $val) { switch ($opt) { case 'e': $options['generate_enclosure'] = TRUE; break; case 'h': case 'help': echo usage($argv); exit(0); default: fwrite(STDERR, usage($argv)); exit(1); } } $options['src_url'] = $argv[count($cli_options) + 1]; return $options; } function parse_options_query_string() { $options = array( 'generate_enclosure' => FALSE ); if (isset($_GET['src_url'])) $options['src_url'] = $_GET['src_url']; if (isset($_GET['generate_enclosure'])) $options['generate_enclosure'] = $_GET['generate_enclosure'] == 1; return $options; } if (php_sapi_name() != 'cli') { $options = parse_options_query_string(); $ERROR_STREAM = fopen('php://output', 'w'); } else { $options = parse_options_cli($argv, $argc); $ERROR_STREAM = fopen('php://stderr', 'w'); } if (!isset($options['src_url'])) { fwrite($ERROR_STREAM, usage($argv)); exit(1); } $url = parse_url($options['src_url']); if (FALSE === $url || empty($url["host"])) { fwrite($ERROR_STREAM, "Invalid url: ${options['src_url']}\n"); exit(1); } $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $url["host"] . ".xsl"; if (FALSE === file_exists($stylesheet)) { fwrite($ERROR_STREAM, "Conversion to RSS not supported: {$url["host"]}\n"); exit(1); } $tweeper = new Tweeper($stylesheet, $options['generate_enclosure']); echo $tweeper->tweep($options['src_url']);