Class: Workers::Spider

Inherits:
Object
  • Object
show all
Includes:
Ronin::App, Sidekiq::Worker
Defined in:
workers/spider.rb

Overview

Web spider worker.

Constant Summary collapse

# Dry::Schema JSON schema for the params of a web spider job.
# `#validate` runs incoming params through it and raises ArgumentError
# when the result is a failure.
Params =
Dry::Schema::JSON() do
  # Required keys: the kind of target (checked against
  # Types::Spider::TargetType, which is defined outside this schema) and
  # the target itself as a non-empty String.
  required(:type).filled(Types::Spider::TargetType)
  required(:target).filled(:string)

  # Every key below is optional and, via `maybe`, may also be nil.
  # `#perform` forwards whatever survives validation (minus :type and
  # :target) to the spider as keyword arguments.

  # String-valued request settings.
  optional(:host_header).maybe(:string)
  # optional(:host_headers)
  # optional(:default_headers)
  optional(:user_agent).maybe(:string)
  optional(:referer).maybe(:string)
  # Integer-valued timeouts (units are not specified by this schema).
  optional(:open_timeout).maybe(:integer)
  optional(:read_timeout).maybe(:integer)
  optional(:ssl_timeout).maybe(:integer)
  optional(:continue_timeout).maybe(:integer)
  optional(:keep_alive_timeout).maybe(:integer)
  optional(:proxy).maybe(:string)
  # Integer-valued crawl settings.
  optional(:delay).maybe(:integer)
  optional(:limit).maybe(:integer)
  optional(:max_depth).maybe(:integer)
  # Boolean flags.
  optional(:strip_fragments).maybe(:bool)
  optional(:strip_query).maybe(:bool)
  # Array-valued settings; the schema does not constrain the element type.
  # NOTE(review): presumably paired allow/ignore filters for the spider --
  # confirm against the spider agent's accepted options.
  optional(:hosts).maybe(:array)
  optional(:ignore_hosts).maybe(:array)
  optional(:ports).maybe(:array)
  optional(:ignore_ports).maybe(:array)
  optional(:urls).maybe(:array)
  optional(:ignore_urls).maybe(:array)
  optional(:exts).maybe(:array)
  optional(:ignore_exts).maybe(:array)
  optional(:robots).maybe(:bool)
end

Constants included from Ronin::App

Ronin::App::ROOT, Ronin::App::VERSION

Instance Method Summary collapse

Instance Method Details

#import_url(url) ⇒ Object

Imports a URL.

Parameters:

  • url (String, URI::HTTP)

    The URL or URI to import.



141
142
143
144
145
# File 'workers/spider.rb', line 141

# Imports a URL inside a `Ronin::DB::URL` transaction.
#
# @param [String, URI::HTTP] url
#   The URL or URI to import.
#
# @return [Object]
#   Whatever `Ronin::DB::URL.transaction` returns.
def import_url(url)
  url_model = Ronin::DB::URL

  url_model.transaction { url_model.import(url) }
end

#perform(params) ⇒ Object

Processes a web spider job.

Parameters:

  • params (Hash{String => Object})

    The JSON deserialized params for the job.

Raises:

  • (ArgumentError)

    The params could not be validated or coerced.



77
78
79
80
81
82
83
84
85
86
87
88
# File 'workers/spider.rb', line 77

# Processes a web spider job: validates the params, spiders the target,
# and for every page visited prints its URL to stdout and imports it.
#
# @param [Hash{String => Object}] params
#   The JSON deserialized params for the job.
#
# @raise [ArgumentError]
#   The params could not be validated or coerced.
def perform(params)
  options = validate(params)

  # Pull the two positional values out so that only spider keyword
  # arguments remain in the options Hash.
  type, target = %i[type target].map { |key| options.delete(key) }

  spider(type, target, **options) do |agent|
    agent.every_page do |page|
      puts page.url
      import_url(page.url)
    end
  end
end

#spider(type, target, **kwargs) {|agent| ... } ⇒ Object

Spiders a host, domain, or site.

Parameters:

  • type ("host", "domain", "site")

    Indicates whether to spider a host, domain, or site.

  • target (String)

    The host name, domain name, or website base URL to spider.

Yields:

  • (agent)

    The given block will be yielded the new web spider agent to configure.

Yield Parameters:

  • agent (Ronin::Web::Spider::Agent)

    The new web spider agent.



127
128
129
130
131
132
133
# File 'workers/spider.rb', line 127

# Spiders a host, domain, or site.
#
# @param ["host", "domain", "site"] type
#   Indicates whether to spider a host, domain, or site.
#
# @param [String] target
#   The host name, domain name, or website base URL to spider.
#
# @param [Hash{Symbol => Object}] kwargs
#   Additional keyword arguments, passed through unchanged to the selected
#   `Ronin::Web::Spider` method.
#
# @yield [agent]
#   The given block is passed through to the selected `Ronin::Web::Spider`
#   method.
#
# @yieldparam [Ronin::Web::Spider::Agent] agent
#   The new web spider agent.
#
# @return [Object]
#   Whatever the selected `Ronin::Web::Spider` method returns.
#
# @raise [ArgumentError]
#   The type was not "host", "domain", or "site".
def spider(type,target,**kwargs,&block)
  case type
  when 'host'   then Ronin::Web::Spider.host(target,**kwargs,&block)
  when 'domain' then Ronin::Web::Spider.domain(target,**kwargs,&block)
  when 'site'   then Ronin::Web::Spider.site(target,**kwargs,&block)
  else
    # Fail loudly: silently returning nil here would let a job with an
    # unrecognized type finish "successfully" without spidering anything.
    raise(ArgumentError,"unsupported spider type: #{type.inspect}")
  end
end

#validate(params) ⇒ Hash{Symbol => Object}

Validates the given job params.

Parameters:

  • params (Hash{String => Object})

    The JSON deserialized params for the job.

Returns:

  • (Hash{Symbol => Object})

    The validated and coerced params as a Symbol Hash.

Raises:

  • (ArgumentError)

    The params could not be validated or coerced.



102
103
104
105
106
107
108
109
110
# File 'workers/spider.rb', line 102

# Validates the given job params against the `Params` schema.
#
# @param [Hash{String => Object}] params
#   The JSON deserialized params for the job.
#
# @return [Hash{Symbol => Object}]
#   The validated and coerced params as a Symbol Hash.
#
# @raise [ArgumentError]
#   The params could not be validated or coerced.
def validate(params)
  outcome = Params.call(params)

  # Happy path first: hand back the schema result as a Hash.
  return outcome.to_h unless outcome.failure?

  message = "invalid spider params (#{params.inspect}): #{outcome.errors.inspect}"
  raise(ArgumentError,message)
end