Parent

Class/Module Index [+]

Quicksearch

RobotRules

Based on Perl's WWW::RobotRules module, by Gisle Aas.

Public Class Methods

new( user_agent ) click to toggle source
# File lib/spider/robot_rules.rb, line 10
# Creates a rule set for the robot identified by +user_agent+.
#
# Only the first whitespace-delimited token of +user_agent+ is kept; anything
# from the first "/" onward (typically a version suffix) is removed and the
# result is lowercased, e.g. "FooBot/1.0 (+http://example.com)" => "foobot".
#
# user_agent:: the robot's User-Agent string; must contain at least one
#              non-whitespace character.
def initialize( user_agent )
  # %r{...} is required here: a plain %{...} literal is a String, and
  # String#sub with a String pattern would only match the literal text "/.*".
  @user_agent = user_agent[/\S+/].sub(%r{/.*}, "").downcase

  # Maps "host:port" => Array of disallowed path prefixes; unknown locations
  # get (and store) a fresh empty Array on first access.
  @rules = Hash.new { |by_location, location| by_location[location] = [] }
end

Public Instance Methods

allowed?( text_uri ) click to toggle source
# File lib/spider/robot_rules.rb, line 68
# Reports whether this robot may fetch +text_uri+ according to the rules
# previously stored for the URI's "host:port" location.
#
# URIs whose scheme is not http or https (including scheme-less, relative
# URIs) are always considered allowed. Otherwise the URI is disallowed when
# its path starts with any stored rule for that location.
#
# text_uri:: the URI to check, as a String.
#
# Returns true or false. Raises whatever URI.parse raises for an unparsable
# +text_uri+.
def allowed?( text_uri )
  uri = URI.parse(text_uri)

  # Must be an Array of schemes: a String here would turn include? into a
  # substring test (accepting e.g. "tt") and raise TypeError for a nil scheme.
  return true unless %w[http https].include?(uri.scheme)

  location = "#{uri.host}:#{uri.port}"

  # An empty path refers to the site root, so let "/" rules apply to it.
  path = uri.path.to_s
  path = "/" if path.empty?

  @rules[location].none? { |rule| path.start_with?(rule) }
end
parse( text_uri, robots_data ) click to toggle source
# File lib/spider/robot_rules.rb, line 15
# Parses the text of a robots.txt file and stores the Disallow rules that
# apply to this robot for the site +text_uri+ belongs to.
#
# Rules are collected into two lists: those under "User-Agent: *" (and any
# Disallow lines seen before the first User-Agent line), and those under a
# User-Agent line whose value contains this robot's name. If any rule was
# recorded for this robot specifically, only those are kept; otherwise the
# anonymous ("*") rules are used. Any rules previously stored for the same
# "host:port" are replaced.
#
# text_uri::    URI the robots.txt was fetched from; its scheme, host and
#               port identify the site and filter absolute Disallow values.
# robots_data:: the robots.txt contents, as a String.
#
# Returns the Array of disallowed path prefixes stored for the location.
# Raises whatever URI.parse raises for an unparsable +text_uri+.
def parse( text_uri, robots_data )
  uri      = URI.parse(text_uri)
  location = "#{uri.host}:#{uri.port}"
  @rules.delete(location)

  anon_rules = []
  my_rules   = []
  current    = anon_rules

  # Split on CR/LF runs (the octal form "\0015\0012" is not read as CR/LF by
  # Ruby's regexp engine) and strip trailing "# ..." comments per line.
  robots_data.split(/[\r\n]+/).each do |line|
    rule = line.sub(/\s*#.*$/, "")

    case rule
    when /^\s*User-Agent\s*:\s*(.+?)\s*$/i # field names vary in case ("User-agent")
      # Once rules for this robot have been seen, a new record ends parsing.
      break unless my_rules.empty?

      agent   = Regexp.last_match(1)
      current = if agent == "*"
                  anon_rules
                elsif agent.downcase.index(@user_agent)
                  my_rules
                end # nil: record is for some other robot, ignore its rules
    when /^\s*Disallow\s*:\s*(.*?)\s*$/i
      next if current.nil?

      value = Regexp.last_match(1)
      if value.empty?
        # An empty Disallow allows everything. The nil placeholder makes the
        # list non-empty (so a matching record still wins over "*") and is
        # removed by compact below.
        current << nil
        next
      end

      begin
        disallow = URI.parse(value)
      rescue URI::InvalidURIError
        # A single malformed rule should not abort parsing the whole file.
        next
      end

      # Absolute Disallow values only apply when they point at this site.
      next unless disallow.scheme.nil? || disallow.scheme == uri.scheme
      next unless disallow.port.nil?   || disallow.port == uri.port
      next unless disallow.host.nil?   ||
                  disallow.host.downcase == uri.host.to_s.downcase

      path = disallow.path
      path = "/"        if path.empty?
      path = "/#{path}" unless path.start_with?("/")

      current << path
    end
  end

  @rules[location] = (my_rules.empty? ? anon_rules : my_rules).compact
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.