Based on Perl's WWW::RobotRules module, by Gisle Aas.
# File lib/spider/robot_rules.rb, line 68
def allowed?( text_uri )
  uri = URI.parse(text_uri)
  location = "#{uri.host}:#{uri.port}"
  path = uri.path

  # Only HTTP(S) URIs are subject to robots.txt restrictions.
  return true unless %w{http https}.include?(uri.scheme)

  # Disallowed when any recorded rule for this host:port is a prefix of the path.
  not @rules[location].any? { |rule| path.index(rule) == 0 }
end
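A minimal usage sketch. The surrounding class and its constructor are not shown on this page, so the RobotRules name and the RobotRules.new(user_agent) signature below are assumptions, as is the expectation that the constructor populates @user_agent and gives @rules an empty-array default for unseen locations.

  require "uri"

  robot = RobotRules.new("MyCrawler/1.0")   # assumed constructor
  robot.parse("http://example.com/robots.txt",
              "User-Agent: *\nDisallow: /private\n")

  robot.allowed?("http://example.com/index.html")  # => true
  robot.allowed?("http://example.com/private/x")   # => false
  robot.allowed?("ftp://example.com/private/x")    # => true (non-HTTP(S) schemes are never restricted)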
# File lib/spider/robot_rules.rb, line 15
def parse( text_uri, robots_data )
  uri = URI.parse(text_uri)
  location = "#{uri.host}:#{uri.port}"
  @rules.delete(location)

  # Split on runs of CR/LF and strip trailing comments.
  rules = robots_data.split(/[\015\012]+/).map do |rule|
    rule.sub(/\s*#.*$/, "")
  end

  anon_rules = Array.new
  my_rules = Array.new
  current = anon_rules
  rules.each do |rule|
    case rule
    when /^\s*User-Agent\s*:\s*(.+?)\s*$/
      # Once rules naming this crawler have been collected, a new record ends parsing.
      break unless my_rules.empty?

      current = if $1 == "*"
                  anon_rules
                elsif $1.downcase.index(@user_agent)
                  my_rules
                else
                  nil
                end
    when /^\s*Disallow\s*:\s*(.*?)\s*$/
      next if current.nil?

      if $1.empty?
        current << nil
      else
        disallow = URI.parse($1)

        # Skip Disallow values that point at a different scheme, port, or host.
        next unless disallow.scheme.nil? or disallow.scheme == uri.scheme
        next unless disallow.port.nil? or disallow.port == uri.port
        next unless disallow.host.nil? or
                    disallow.host.downcase == uri.host.downcase

        disallow = disallow.path
        disallow = "/" if disallow.empty?
        disallow = "/#{disallow}" unless disallow[0] == ?/

        current << disallow
      end
    end
  end

  # Rules naming this crawler take precedence over the anonymous ("*") record.
  @rules[location] = if my_rules.empty?
                       anon_rules.compact
                     else
                       my_rules.compact
                     end
end
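A sketch of the record-selection behavior, under the same assumed class name and constructor as above. The User-Agent match is a substring test ($1.downcase.index(@user_agent)), so @user_agent is assumed to be stored as a lowercased token such as "mycrawler". A record naming this crawler replaces the anonymous "*" record entirely:

  robots_txt = <<~TXT
    User-Agent: *
    Disallow: /tmp

    User-Agent: mycrawler
    Disallow: /private
    Disallow:    # empty value becomes nil and is removed by compact
  TXT

  robot = RobotRules.new("MyCrawler/1.0")  # assumed constructor
  robot.parse("http://example.com/", robots_txt)

  robot.allowed?("http://example.com/tmp/a")     # => true, the "*" record was superseded
  robot.allowed?("http://example.com/private/a") # => false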