# frozen_string_literal: true

# stdlib and gem dependencies used directly in this file
require 'set'
require 'uri'
require 'spidr'
require 'webrick/httputils'

require 'validate_website/core'
require 'validate_website/utils'

module ValidateWebsite
  # Class for http website validation
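  #
  # A minimal usage sketch, assuming Core#initialize accepts a :site
  # option; the URL and option values are illustrative.
  #
  # @example Crawl a site, checking markup and missing pages
  #   crawl = ValidateWebsite::Crawl.new(site: 'https://example.org/')
  #   crawl.crawl(markup: true, not_found: true)
  #   puts crawl.history_count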
  class Crawl < Core
    attr_reader :crawler

    def initialize(options = {}, validation_type = :crawl)
      super
      start_message(@site)
    end

    # @return [Integer] number of pages visited by the crawler
    def history_count
      crawler.history.size
    end

    # @param [Hash] options
    #   :color [Boolean] color output (true, false)
    #   :exclude [String] a String used by Regexp.new
    #   :markup [Boolean] check the markup validity
    #   :not_found [Boolean] check for not-found pages (404)
    #
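    # The call below is a usage sketch; `crawl` is assumed to be an
    # existing ValidateWebsite::Crawl instance.
    #
    # @example Re-crawl with markup and 404 checks enabled
    #   crawl.crawl(markup: true, not_found: true)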
    def crawl(options = {})
      @options = @options.merge(options)
      @options[:ignore_links] = @options[:exclude] if @options[:exclude]

      @crawler = spidr_crawler(@site, @options)
      print_status_line(@crawler.history.size,
                        @crawler.failures.size,
                        @not_founds_count,
                        @errors_count)
    end

    private

    # Extract image URLs from a page
    #
    # @param [Spidr::Page] page a Spidr::Page object
    # @return [Set] the set of image URLs found on the page
    #
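    # The return value shown is a sketch, assuming the page contains a
    # single <img src="/logo.png"> tag resolved against the page URL.
    #
    # @example
    #   extract_imgs_from_page(page)
    #   # => #<Set: {#<URI::HTTPS https://example.org/logo.png>}>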
    def extract_imgs_from_page(page)
      return Set[] if page.is_redirect?

      page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
        u = elem.attributes['src'].content
        result << page.to_absolute(URI.parse(WEBrick::HTTPUtils.escape(u)))
      end
    end

    def spidr_crawler(site, options)
      @host = URI(site).host
      Spidr.site(site, **options.slice(:user_agent, :ignore_links)) do |crawler|
        crawler.cookies[@host] = default_cookies if options[:cookies]
        on_every_css_page(crawler)
        on_every_html_page(crawler)
        on_every_failed_url(crawler) if options[:not_found]
      end
    end

    def on_every_css_page(crawler)
      crawler.every_css_page do |page|
        check_css_syntax(page) if options[:css_syntax]
        ValidateWebsite::Utils.extract_urls_from_css(page).each do |u|
          crawler.enqueue(u)
        end
      end
    end

    # Validate markup only for HTML pages that are not redirects
    def validate?(page)
      options[:markup] && page.html? && !page.is_redirect?
    end

    def on_every_html_page(crawler)
      crawler.every_html_page do |page|
        extract_imgs_from_page(page).each do |i|
          crawler.enqueue(i)
        end

        if validate?(page)
          keys = %i[ignore html5_validator]
          validate(page.doc, page.body, page.url, options.slice(*keys))
        end
      end
    end

    def on_every_failed_url(crawler)
      crawler.every_failed_url do |url|
        not_found_error(url)
      end
    end
  end
end