pax_global_header 0000666 0000000 0000000 00000000064 15057613507 0014523 g ustar 00root root 0000000 0000000 52 comment=cea4f40b0adabdac09f63fe800bab9e9b35fb5a8 johnnagro-spider-e2300ba/ 0000775 0000000 0000000 00000000000 15057613507 0015372 5 ustar 00root root 0000000 0000000 johnnagro-spider-e2300ba/.gitignore 0000664 0000000 0000000 00000000126 15057613507 0017361 0 ustar 00root root 0000000 0000000 *.gem /Gemfile.lock /.bundle /vendor /doc /pkg /rdoc /.yardoc dump.rdb .tool-versions johnnagro-spider-e2300ba/.rubocop.yml 0000664 0000000 0000000 00000000106 15057613507 0017641 0 ustar 00root root 0000000 0000000 Style/Documentation: Enabled: false Metrics/LineLength: Max: 200 johnnagro-spider-e2300ba/AUTHORS 0000664 0000000 0000000 00000000611 15057613507 0016440 0 ustar 00root root 0000000 0000000 The Ruby Spider Gem would not be what it is today without the help of the following kind souls: @apfeltee Alexandre Rousseau Brian Campbell Henri Cook James Edward Gray II Jeremy Evans Joao Eriberto Mota Filho John Buckley John Nagro (project maintainer) Josef Strzibny (@strzibny) Matt Horan Marc (@brigriffin) Mike Burns (original author) Olle Jonsson Sander van der Vliet Stuart Yamartino johnnagro-spider-e2300ba/CHANGELOG.md 0000664 0000000 0000000 00000005135 15057613507 0017207 0 ustar 00root root 0000000 0000000 # Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.7.0] - 2025-09-08 ### Added - more efficient URL scanning, skipping non-html content. thanks to @apfeltee - honor "nofollow" link `rel` attributes. thanks to @strzibny ## [0.6.0] - 2025-09-08 ### Added - updates to gemspec such as minimum ruby version - markdown format changelog - rebuilt test suite with Minitest - mock memcached and redis-server for testing ### Fixed - better handling of port numbers - better handling of status codes ## [0.5.3] - 2018-04-23 * release simply to add missing CHANGES notes ## [0.5.2] - 2018-04-23 * fixed #2 thanks to @jeremyevans * added Redis as cache wrapper thanks to @brigriffin ## [0.5.1] - 2016-09-04 v * added the ability to stop a crawl ## [0.5.0] - 2016-05-13 * fixed #1 thanks to @eribertomota * got it running on more recent versions of ruby * cleaned up the docs a bit * cleaned up the licensing and attribution ## 2009-05-21 * fixed an issue with robots.txt on ssl hosts * fixed an issue with pulling robots.txt from disallowed hosts * fixed a documentation error with ExpiredLinks * Many thanks to Brian Campbell ## 2008-10-09 * fixed a situation with nested slashes in urls, thanks to Sander van der Vliet and John Buckley ## 2008-07-06 * Trap interrupts and shutdown gracefully * Support for custom urls-to-crawl objects * Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb) ## 2007-11-09: * Handle redirects that assume a base URL. ## 2007-11-08: * Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into spider subdirectory. ## 2007-11-02: * Memcached support. ## 2007-10-31: * Add `setup' and `teardown' handlers. * Can set the headers for a HTTP request. * Changed :any to :every . * Changed the arguments to the :every, :success, :failure, and code handler. ## [0.2.1] - 2007-10-23: * URLs without a page component but with a query component. * HTTP Redirect. * HTTPS. * Version 0.2.1 . ## [0.2.0] - 2007-10-22: * Use RSpec to ensure that it mostly works. * Use WEBrick to create a small test server for additional testing. * Completely re-do the API to prepare for future expansion. * Add the ability to apply each URL to a series of custom allowed?-like matchers. * BSD license. * Version 0.2.0 . ## 2007-03-30: * Clean up the documentation. ## [0.1.0] - 2007-03-28: * Change the tail recursion to a `while' loop, to please Ruby. * Documentation. * Initial release: version 0.1.0 . johnnagro-spider-e2300ba/Gemfile 0000664 0000000 0000000 00000000566 15057613507 0016674 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true source "https://rubygems.org" # Specify your gem's dependencies in active_record_pretty_key.gemspec gemspec group :development, :test do gem "rake", "~> 13.0" gem "minitest", "~> 5.0" gem "webrick", "~> 1.9" gem "dalli", "~> 3.2" gem "redis", "~> 5.4" end group :development do gem "bundler", "~> 2.0" gem "pry", "~> 0.14" end johnnagro-spider-e2300ba/LICENSE 0000664 0000000 0000000 00000002103 15057613507 0016373 0 ustar 00root root 0000000 0000000 The MIT License (MIT) Copyright (c) 2007-2025 Spider Team Authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. johnnagro-spider-e2300ba/README.md 0000664 0000000 0000000 00000007130 15057613507 0016652 0 ustar 00root root 0000000 0000000 # Spider _a Web spidering library for Ruby. It handles the robots.txt, scraping, collecting, and looping so that you can just handle the data._ ## Examples ### Crawl the Web, loading each page in turn, until you run out of memory ```ruby require 'spider' Spider.start_at('http://cashcats.biz/') {} ``` ### To handle erroneous responses ```ruby require 'spider' Spider.start_at('http://cashcats.biz/') do |s| s.on :failure do |a_url, resp, prior_url| puts "URL failed: #{a_url}" puts " linked from #{prior_url}" end end ``` ### Or handle successful responses ```ruby require 'spider' Spider.start_at('http://cashcats.biz/') do |s| s.on :success do |a_url, resp, prior_url| puts "#{a_url}: #{resp.code}" puts resp.body puts end end ``` ### Limit to just one domain ```ruby require 'spider' Spider.start_at('http://cashcats.biz/') do |s| s.add_url_check do |a_url| a_url =~ %r{^http://cashcats.biz.*} end end ``` ### Pass headers to some requests ```ruby require 'spider' Spider.start_at('http://cashcats.biz/') do |s| s.setup do |a_url| if a_url =~ %r{^http://.*wikipedia.*} headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" end end end ``` ### Use memcached to track cycles ```ruby require 'spider' require 'spider/included_in_memcached' SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211'] Spider.start_at('http://cashcats.biz/') do |s| s.check_already_seen_with IncludedInMemcached.new(SERVERS) end ``` ### Use Redis to track cycles ```ruby require 'spider' require 'spider/included_in_redis' Spider.start_at('http://cashcats.biz/') do |s| s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379) end ``` ### Use Plain text to track cycles ```ruby require 'spider' require 'spider/included_in_file' Spider.start_at('http://cashcats.biz/') do |s| s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt') end ``` ### Track cycles with a custom object ```ruby require 'spider' class ExpireLinks < Hash def <<(v) self[v] = Time.now end def include?(v) self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now end end Spider.start_at('http://cashcats.biz/') do |s| s.check_already_seen_with ExpireLinks.new end ``` ### Store nodes to visit with Amazon SQS ```ruby require 'spider' require 'spider/next_urls_in_sqs' Spider.start_at('http://cashcats.biz') do |s| s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY) end ``` ### Store nodes to visit with a custom object ```ruby require 'spider' class MyArray < Array def pop super end def push(a_msg) super(a_msg) end end Spider.start_at('http://cashcats.biz') do |s| s.store_next_urls_with MyArray.new end ``` ### Create a URL graph ```ruby require 'spider' nodes = {} Spider.start_at('http://cashcats.biz/') do |s| s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} } s.on(:every) do |a_url, resp, prior_url| nodes[prior_url] ||= [] nodes[prior_url] << a_url end end ``` ### Use a proxy ```ruby require 'net/http_configuration' require 'spider' http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org', :proxy_port => 8881) http_conf.apply do Spider.start_at('http://img.4chan.org/b/') do |s| s.on(:success) do |a_url, resp, prior_url| File.open(a_url.gsub('/',':'),'w') do |f| f.write(resp.body) end end end end ``` _Copyright (c) 2007-2025 Spider Team Authors_ johnnagro-spider-e2300ba/Rakefile 0000664 0000000 0000000 00000000247 15057613507 0017042 0 ustar 00root root 0000000 0000000 require 'rake/testtask' Rake::TestTask.new(:test) do |t| t.libs << 'test' t.libs << 'lib' t.test_files = FileList['test/**/test_*.rb'] end task default: :test johnnagro-spider-e2300ba/VERSION 0000664 0000000 0000000 00000000005 15057613507 0016435 0 ustar 00root root 0000000 0000000 0.7.0 johnnagro-spider-e2300ba/lib/ 0000775 0000000 0000000 00000000000 15057613507 0016140 5 ustar 00root root 0000000 0000000 johnnagro-spider-e2300ba/lib/spider.rb 0000664 0000000 0000000 00000002362 15057613507 0017756 0 ustar 00root root 0000000 0000000 require File.dirname(__FILE__)+'/spider/spider_instance' # A spidering library for Ruby. Handles robots.txt, scraping, finding more # links, and doing it all over again. class Spider VERSION = File.read( File.expand_path('../VERSION', __dir__) ).strip.freeze def self.version VERSION end # Runs the spider starting at the given URL. Also takes a block that is given # the SpiderInstance. Use the block to define the rules and handlers for # the discovered Web pages. See SpiderInstance for the possible rules and # handlers. # # Spider.start_at('http://cashcats.biz/') do |s| # s.add_url_check do |a_url| # a_url =~ %r{^http://cashcats.biz.*} # end # # s.on 404 do |a_url, resp, prior_url| # puts "URL not found: #{a_url}" # end # # s.on :success do |a_url, resp, prior_url| # puts "body: #{resp.body}" # end # # s.on :every do |a_url, resp, prior_url| # puts "URL returned anything: #{a_url} with this code #{resp.code}" # end # end def self.start_at(a_url, &block) rules = RobotRules.new("Ruby Spider #{Spider::VERSION}") a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, []) block.call(a_spider) a_spider.start! end end johnnagro-spider-e2300ba/lib/spider/ 0000775 0000000 0000000 00000000000 15057613507 0017426 5 ustar 00root root 0000000 0000000 johnnagro-spider-e2300ba/lib/spider/included_in_file.rb 0000664 0000000 0000000 00000002020 15057613507 0023221 0 ustar 00root root 0000000 0000000 # Use plain text file to track cycles. # A specialized class using a plain text to track items stored. It supports # three operations: new, <<, and include? . Together these can be used to # add items to the text file, then determine whether the item has been added. # # To use it with Spider use the check_already_seen_with method: # # Spider.start_at('http://example.com/') do |s| # s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log') # end class IncludedInFile # Construct a new IncludedInFile instance. # @param filepath [String] as path of file to store crawled URL def initialize(filepath) @filepath = filepath # create file if not exists File.write(@filepath, '') unless File.file?(@filepath) @urls = File.readlines(@filepath).map(&:chomp) end # Add an item to the file & array of URL. def <<(v) @urls << v.to_s File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a') end # True if the item is in the file. def include?(v) @urls.include? v.to_s end end johnnagro-spider-e2300ba/lib/spider/included_in_memcached.rb 0000664 0000000 0000000 00000001721 15057613507 0024217 0 ustar 00root root 0000000 0000000 # Use memcached to track cycles. require 'dalli' # A specialized class using memcached to track items stored. It supports # three operations: new, <<, and include? . Together these can be used to # add items to the memcache, then determine whether the item has been added. # # To use it with Spider use the check_already_seen_with method: # # Spider.start_at('http://example.com/') do |s| # s.check_already_seen_with IncludedInMemcached.new('localhost:11211') # end class IncludedInMemcached # Construct a new IncludedInMemcached instance. The first argument should be # the memcached server address (e.g., 'localhost:11211'). Additional options # can be passed as a hash (see Dalli::Client documentation). def initialize(server, options = {}) @c = Dalli::Client.new(server, options) end # Add an item to the memcache. def <<(v) @c.add(v.to_s, v) end # True if the item is in the memcache. def include?(v) @c.get(v.to_s) == v end end johnnagro-spider-e2300ba/lib/spider/included_in_redis.rb 0000664 0000000 0000000 00000001503 15057613507 0023415 0 ustar 00root root 0000000 0000000 # Use Redis to track cycles. require 'redis' require 'json' # A specialized class using Redis to track items stored. It supports # three operations: new, <<, and include? . Together these can be used to # add items to Redis, then determine whether the item has been added. # # To use it with Spider use the check_already_seen_with method: # # Spider.start_at('http://example.com/') do |s| # s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379) # end class IncludedInRedis # Construct a new IncludedInRedis instance. All arguments here are # passed to Redis (part of the redis gem). def initialize(*a) @c = Redis.new(*a) end # Add an item to Redis def <<(v) @c.set(v.to_s, v.to_json) end # True if the item is in Redis def include?(v) @c.get(v.to_s) == v.to_json end end johnnagro-spider-e2300ba/lib/spider/next_urls_in_sqs.rb 0000664 0000000 0000000 00000002533 15057613507 0023355 0 ustar 00root root 0000000 0000000 # Use AmazonSQS to track nodes to visit. require 'rubygems' require 'right_aws' require 'yaml' # A specialized class using AmazonSQS to track nodes to walk. It supports # two operations: push and pop . Together these can be used to # add items to the queue, then pull items off the queue. # # This is useful if you want multiple Spider processes crawling the same # data set. # # To use it with Spider use the store_next_urls_with method: # # Spider.start_at('http://example.com/') do |s| # s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name) # end class NextUrlsInSQS # Construct a new NextUrlsInSQS instance. All arguments here are # passed to RightAWS::SqsGen2 (part of the right_aws gem) or used # to set the AmazonSQS queue name (optional). def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider') @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key) @queue = @sqs.queue(queue_name) end # Pull an item off the queue, loop until data is found. Data is # encoded with YAML. def pop while true message = @queue.pop return YAML::load(message.to_s) unless message.nil? sleep 5 end end # Put data on the queue. Data is encoded with YAML. def push(a_msg) encoded_message = YAML::dump(a_msg) @queue.push(a_msg) end end johnnagro-spider-e2300ba/lib/spider/robot_rules.rb 0000664 0000000 0000000 00000004244 15057613507 0022316 0 ustar 00root root 0000000 0000000 #!/usr/local/bin/ruby -w # robot_rules.rb # # Created by James Edward Gray II on 2006-01-31. # Copyright 2006 Gray Productions. All rights reserved. # https://github.com/eribertomota/robot_rules.rb # https://github.com/johnnagro/spider/issues/1 require "uri" # Based on Perl's WWW::RobotRules module, by Gisle Aas. class RobotRules def initialize( user_agent ) @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase @rules = Hash.new { |rules, rule| rules[rule] = Array.new } end def parse( text_uri, robots_data ) uri = URI.parse(text_uri) location = "#{uri.host}:#{uri.port}" @rules.delete(location) rules = robots_data.split(/[\015\012]+/). map { |rule| rule.sub(/\s*#.*$/, "") } anon_rules = Array.new my_rules = Array.new current = anon_rules rules.each do |rule| case rule when /^\s*User-Agent\s*:\s*(.+?)\s*$/i break unless my_rules.empty? current = if $1 == "*" anon_rules elsif $1.downcase.index(@user_agent) my_rules else nil end when /^\s*Disallow\s*:\s*(.*?)\s*$/i next if current.nil? if $1.empty? current << nil else disallow = URI.parse($1) next unless disallow.scheme.nil? or disallow.scheme == uri.scheme next unless disallow.port.nil? or disallow.port == uri.port next unless disallow.host.nil? or disallow.host.downcase == uri.host.downcase disallow = disallow.path disallow = "/" if disallow.empty? disallow = "/#{disallow}" unless disallow[0] == ?/ current << disallow end end end @rules[location] = if my_rules.empty? anon_rules.compact else my_rules.compact end end def allowed?( text_uri ) uri = URI.parse(text_uri) location = "#{uri.host}:#{uri.port}" path = uri.path return true unless %w{http https}.include?(uri.scheme) not @rules[location].any? { |rule| path.index(rule) == 0 } end end johnnagro-spider-e2300ba/lib/spider/spider_instance.rb 0000664 0000000 0000000 00000024130 15057613507 0023125 0 ustar 00root root 0000000 0000000 # Specialized spidering rules. require File.dirname(__FILE__)+'/robot_rules.rb' require 'open-uri' require 'uri' require 'net/http' require 'net/https' module Net #:nodoc: class HTTPResponse #:nodoc: def success?; false; end def redirect?; false; end end class HTTPSuccess #:nodoc: def success?; true; end end class HTTPRedirection #:nodoc: def redirect?; true; end end end class NilClass #:nodoc: def merge(h); h; end end class SpiderInstance def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc: @url_checks = [] @cache = :memory @callbacks = {} @next_urls = [next_urls] @seen = seen @rules = rules || RobotRules.new("Ruby Spider #{Spider::VERSION}") @robots_seen = robots_seen @headers = {} @setup = nil @teardown = nil @interrupted = false end # Add a predicate that determines whether to continue down this URL's path. # All predicates must be true in order for a URL to proceed. # # Takes a block that takes a string and produces a boolean. For example, this # will ensure that the URL starts with 'http://cashcats.biz': # # add_url_check { |a_url| a_url =~ %r{^http://cashcats.biz.*} def add_url_check(&block) @url_checks << block end # The Web is a graph; to avoid cycles we store the nodes (URLs) already # visited. The Web is a really, really, really big graph; as such, this list # of visited nodes grows really, really, really big. # # Change the object used to store these seen nodes with this. The default # object is an instance of Array. Available with Spider is a wrapper of # memcached. # # You can implement a custom class for this; any object passed to # check_already_seen_with must understand just << and included? . # # # default # check_already_seen_with Array.new # # # memcached # require 'spider/included_in_memcached' # check_already_seen_with IncludedInMemcached.new('localhost:11211') def check_already_seen_with(cacher) if cacher.respond_to?(:<<) && cacher.respond_to?(:include?) @seen = cacher else raise ArgumentError, 'expected something that responds to << and included?' end end # The Web is a really, really, really big graph; as such, this list # of nodes to visit grows really, really, really big. # # Change the object used to store nodes we have yet to walk. The default # object is an instance of Array. Available with Spider is a wrapper of # AmazonSQS. # # You can implement a custom class for this; any object passed to # check_already_seen_with must understand just push and pop . # # # default # store_next_urls_with Array.new # # # AmazonSQS # require 'spider/next_urls_in_sqs' # store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name) def store_next_urls_with(a_store) tmp_next_urls = @next_urls @next_urls = a_store tmp_next_urls.each do |a_url_hash| @next_urls.push a_url_hash end end # Add a response handler. A response handler's trigger can be :every, # :success, :failure, or any HTTP status code. The handler itself can be # either a Proc or a block. # # The arguments to the block are: the URL as a string, an instance of # Net::HTTPResponse, and the prior URL as a string. # # # For example: # # on 404 do |a_url, resp, prior_url| # puts "URL not found: #{a_url}" # end # # on :success do |a_url, resp, prior_url| # puts a_url # puts resp.body # end # # on :every do |a_url, resp, prior_url| # puts "Given this code: #{resp.code}" # end def on(code, p = nil, &block) f = p ? p : block case code when Integer @callbacks[code] = f else @callbacks[code.to_sym] = f end end # Run before the HTTP request. Given the URL as a string. # setup do |a_url| # headers['Cookies'] = 'user_id=1;admin=true' # end def setup(p = nil, &block) @setup = p ? p : block end # Run last, once for each page. Given the URL as a string. def teardown(p = nil, &block) @teardown = p ? p : block end # Use like a hash: # headers['Cookies'] = 'user_id=1;password=btrross3' def headers HeaderSetter.new(self) end def raw_headers #:nodoc: @headers end def raw_headers=(v) #:nodoc: @headers = v end # Reset the headers hash. def clear_headers @headers = {} end def start! #:nodoc: trap("SIGINT") { @interrupted = true } begin next_urls = @next_urls.pop next_urls.each do |prior_url, urls| urls = [urls] unless urls.kind_of?(Array) urls.map do |a_url| [a_url, (URI.parse(a_url) rescue nil)] end.select do |a_url, parsed_url| allowable_url?(a_url, parsed_url) end.each do |a_url, parsed_url| @setup.call(a_url) unless @setup.nil? get_page(parsed_url) do |response| do_callbacks(a_url, response, prior_url) generate_next_urls(a_url, response).each do |a_next_url| @next_urls.push a_url => a_next_url end end @teardown.call(a_url) unless @teardown.nil? break if @interrupted end end end while !@next_urls.empty? && !@interrupted end def stop! #:nodoc: @interrupted = true end def success_or_failure(code) #:nodoc: if code > 199 && code < 300 :success else :failure end end def allowable_url?(a_url, parsed_url) #:nodoc: !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) && @url_checks.map{|url_check|url_check.call(a_url)}.all? end # True if the robots.txt for that URL allows access to it. def allowed?(a_url, parsed_url) # :nodoc: return false unless ['http','https'].include?(parsed_url.scheme) u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt" parsed_u = URI.parse(u) return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all? begin unless @robots_seen.include?(u) #open(u, 'User-Agent' => 'Ruby Spider', # 'Accept' => 'text/html,text/xml,application/xml,text/plain', :ssl_verify => false) do |url| # @rules.parse(u, url.read) #end get_page(parsed_u) do |r| @rules.parse(u, r.body) end @robots_seen << u end @rules.allowed?(a_url) rescue OpenURI::HTTPError true # No robots.txt rescue Exception, Timeout::Error # to keep it from crashing false end end def get_page(parsed_url, &block) #:nodoc: @seen << parsed_url begin http = Net::HTTP.new(parsed_url.host, parsed_url.port) if parsed_url.scheme == 'https' http.use_ssl = true http.verify_mode = OpenSSL::SSL::VERIFY_NONE end # Uses start because http.finish cannot be called. r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))} if r.redirect? get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block) else block.call(r) end rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e p e nil end end def do_callbacks(a_url, resp, prior_url) #:nodoc: cbs = [@callbacks[:every], resp.success? ? @callbacks[:success] : @callbacks[:failure], @callbacks[resp.code.to_i]] cbs.each do |cb| cb.call(a_url, resp, prior_url) if cb end end def generate_next_urls(a_url, resp) #:nodoc: # Only scan for links if the content-type is HTML or the URL ends with .html content_type = resp['Content-Type'] || resp['content-type'] || '' url_ends_with_html = a_url.downcase.end_with?('.html') unless content_type.downcase.include?('text/html') || url_ends_with_html return [] end web_page = resp.body base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten + [a_url[0,a_url.rindex('/')]])[0] base_url = remove_trailing_slash(base_url) # Extract anchor tags with href attributes, respecting rel="nofollow" web_page.scan(/]*href="([^"]*)"[^>]*>/i).flatten.map do |link| # Get the full anchor tag to check for rel attribute anchor_match = web_page.match(/]*href="#{Regexp.escape(link)}"[^>]*>/i) next nil unless anchor_match anchor_tag = anchor_match[0] # Check if this link has rel="nofollow" or similar attributes that should be respected if anchor_tag.match(/rel\s*=\s*["']([^"']*nofollow[^"']*)["']/i) || anchor_tag.match(/rel\s*=\s*["']([^"']*sponsored[^"']*)["']/i) || anchor_tag.match(/rel\s*=\s*["']([^"']*ugc[^"']*)["']/i) next nil # Skip links with nofollow, sponsored, or ugc rel attributes end begin parsed_link = URI.parse(link) if parsed_link.fragment == '#' nil else construct_complete_url(base_url, link, parsed_link) end rescue nil end end.compact end def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc: parsed_additional_url ||= URI.parse(additional_url) case parsed_additional_url.scheme when nil u = base_url.is_a?(URI) ? base_url : URI.parse(base_url) # Include port if it's not the default port port_part = (u.port && ((u.scheme == 'http' && u.port != 80) || (u.scheme == 'https' && u.port != 443))) ? ":#{u.port}" : "" if additional_url[0].chr == '/' "#{u.scheme}://#{u.host}#{port_part}#{additional_url}" elsif u.path.nil? || u.path == '' "#{u.scheme}://#{u.host}#{port_part}/#{additional_url}" elsif u.path[0].chr == '/' "#{u.scheme}://#{u.host}#{port_part}#{u.path}/#{additional_url}" else "#{u.scheme}://#{u.host}#{port_part}/#{u.path}/#{additional_url}" end else additional_url end end def remove_trailing_slash(s) #:nodoc: s.sub(%r{/*$},'') end class HeaderSetter #:nodoc: def initialize(si) @si = si end def []=(k,v) @si.raw_headers = @si.raw_headers.merge({k => v}) end end end johnnagro-spider-e2300ba/spider.gemspec 0000664 0000000 0000000 00000001254 15057613507 0020227 0 ustar 00root root 0000000 0000000 require 'rubygems' require File.expand_path('../lib/spider', __FILE__) Gem::Specification.new do |s| s.author = 'John Nagro' s.email = 'john.nagro@gmail.com' s.license = 'MIT' s.homepage = 'https://github.com/johnnagro/spider' s.required_ruby_version = '>= 2.5' s.name = 'spider' s.summary = 'A Web spidering library' s.files = Dir['lib/**/*'] + ['VERSION'] s.require_path = 'lib' s.description = <<-EOF A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again. EOF s.metadata["source_code_uri"] = s.homepage s.metadata["changelog_uri"] = "#{s.homepage}/blob/main/CHANGELOG.md" s.version = Spider::VERSION end johnnagro-spider-e2300ba/test/ 0000775 0000000 0000000 00000000000 15057613507 0016351 5 ustar 00root root 0000000 0000000 johnnagro-spider-e2300ba/test/mock_memcached.rb 0000664 0000000 0000000 00000001214 15057613507 0021613 0 ustar 00root root 0000000 0000000 # Mock memcached client for testing - no external dependencies required class MockMemcached def initialize(*args) @data = {} end def add(key, value) @data[key] = value unless @data.key?(key) end def get(key) @data[key] end def flush @data.clear end def set(key, value) @data[key] = value end def delete(key) @data.delete(key) end end # Mock version of IncludedInMemcached that uses MockMemcached class MockIncludedInMemcached def initialize(*args) @c = MockMemcached.new(*args) end def <<(v) @c.add(v.to_s, v) end def include?(v) @c.get(v.to_s) == v end end johnnagro-spider-e2300ba/test/mock_redis.rb 0000664 0000000 0000000 00000001106 15057613507 0021013 0 ustar 00root root 0000000 0000000 # Mock Redis client for testing - no external dependencies required require 'json' class MockRedis def initialize(*args) @data = {} end def set(key, value) @data[key] = value end def get(key) @data[key] end def flushdb @data.clear end def del(key) @data.delete(key) end end # Mock version of IncludedInRedis that uses MockRedis class MockIncludedInRedis def initialize(*args) @c = MockRedis.new(*args) end def <<(v) @c.set(v.to_s, v.to_json) end def include?(v) @c.get(v.to_s) == v.to_json end end johnnagro-spider-e2300ba/test/test_helper.rb 0000664 0000000 0000000 00000011377 15057613507 0021225 0 ustar 00root root 0000000 0000000 require 'rubygems' require 'minitest/autorun' require 'minitest/mock' require 'webrick' require_relative 'mock_memcached' require_relative 'mock_redis' def local_require(*files) files.each do |file| require File.dirname(__FILE__)+'/../lib/'+file end end def with_web_server(svlt) server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger, :AccessLog => []) server.mount('/', svlt) Thread.new {server.start} begin yield ensure server.shutdown end end def with_memcached(use_real: false) if use_real # Use real memcached for integration tests system('memcached -d -P /tmp/spider-memcached.pid') sleep 0.5 # Give memcached time to start begin yield ensure system('kill -KILL `cat /tmp/spider-memcached.pid`') if File.exist?('/tmp/spider-memcached.pid') end else # Use mock memcached for fast unit tests yield end end def static_server_pages ['http://localhost:8888/', 'http://localhost:8888/foo'] end class QueryServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res['Content-type'] = 'text/plain' res.body = "response\n" end end class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res['Content-type'] = 'text/html' if req.path == '/foo' res.body = <<-END a END else res.body = <<-END b END end end end class NotFoundServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res.status = 404 res['Content-type'] = 'text/plain' res.body = "Not Found" end end class SuccessServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res.status = 200 res['Content-type'] = 'text/html' res.body = "
Success" end end class JsonServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res.status = 200 res['Content-type'] = 'application/json' res.body = '{"data": "Test Link", "message": "This should not extract links"}' end end class PlainTextServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res.status = 200 res['Content-type'] = 'text/plain' res.body = "Plain text content with fake link that should not be extracted" end end class HtmlWithLinksServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res.status = 200 res['Content-type'] = 'text/html' res.body = <<-HTML