Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--require spec_helper
--format documentation
--color
75 changes: 75 additions & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# human should be able to read 200 chars per line
Layout/LineLength:
Max: 200

# relax branch condition size, code can be too verbose
Metrics/AbcSize:
Max: 40

# short clear method name
Metrics/MethodLength:
Max: 25

# no performance implication
Style/OptionalBooleanParameter:
Enabled: false

# %i(array) is not common in Ruby
Style/SymbolArray:
Enabled: false

# too restrictive
Style/FrozenStringLiteralComment:
Enabled: false

# default complexity is low at 8
Metrics/PerceivedComplexity:
Max: 12

# if works as well as safe navigation (&.)
Style/SafeNavigation:
Enabled: false

# disable this cop; we don't agree with it
Style/FetchEnvVar:
Enabled: false

Style/Documentation:
Enabled: false

# buggy check in RuboCop.
# SerpApiClient constructor is rated to 9
# def initialize(params = {})
Metrics/CyclomaticComplexity:
Max: 12

# There is a tradeoff between line length and line count.
Metrics/ClassLength:
Max: 140

# Keyword args are readable.
Metrics/ParameterLists:
CountKeywordArgs: false

# this rule doesn't always work well with Ruby
Layout/FirstHashElementIndentation:
Enabled: false

# Dir glob is already sorted, but the explicit sort documents intent.
Lint/RedundantDirGlobSort:
Enabled: false

# RSpec describe/context blocks legitimately span many lines
Metrics/BlockLength:
AllowedMethods:
- describe
- context

AllCops:
# hide message
SuggestExtensions: false
# show new cops
NewCops: enable
Exclude:
- "Gemfile"
- "benchmark/**/*"
1 change: 1 addition & 0 deletions .ruby-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
4.0.5
7 changes: 7 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# frozen_string_literal: true

# All gems below are resolved from the public RubyGems index.
source "https://rubygems.org"

# HTML parser required by lib/carousel_parser.rb.
gem "nokolexbor"
# Test framework; its command-line options live in .rspec.
gem "rspec"
# Linter configured by .rubocop.yml. `require: false` keeps Bundler.require from
# loading it automatically, since it is only run as a command-line tool.
gem "rubocop", require: false
83 changes: 83 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
GEM
remote: https://rubygems.org/
specs:
ast (2.4.3)
diff-lcs (1.6.2)
json (2.19.8)
language_server-protocol (3.17.0.5)
lint_roller (1.1.0)
nokolexbor (0.7.0-x86_64-linux)
parallel (2.1.0)
parser (3.3.11.1)
ast (~> 2.4.1)
racc
prism (1.9.0)
racc (1.8.1)
rainbow (3.1.1)
regexp_parser (2.12.0)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)
rubocop (1.87.0)
json (~> 2.3)
language_server-protocol (~> 3.17.0.2)
lint_roller (~> 1.1.0)
parallel (>= 1.10)
parser (>= 3.3.0.2)
rainbow (>= 2.2.2, < 4.0)
regexp_parser (>= 2.9.3, < 3.0)
rubocop-ast (>= 1.49.0, < 2.0)
ruby-progressbar (~> 1.7)
unicode-display_width (>= 2.4.0, < 4.0)
rubocop-ast (1.49.1)
parser (>= 3.3.7.2)
prism (~> 1.7)
ruby-progressbar (1.13.0)
unicode-display_width (3.2.0)
unicode-emoji (~> 4.1)
unicode-emoji (4.2.0)

PLATFORMS
x86_64-linux

DEPENDENCIES
nokolexbor
rspec
rubocop

CHECKSUMS
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
json (2.19.8) sha256=6354310fd76ef69b87d5bd1f38b40d730613baf90b6803d2d0a48f618d32dfaa
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
nokolexbor (0.7.0-x86_64-linux) sha256=6348178e41233e67e0f533f84b0b1974b187fe137616541f1453bb7c0c16baf6
parallel (2.1.0) sha256=b35258865c2e31134c5ecb708beaaf6772adf9d5efae28e93e99260877b09356
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
rubocop (1.87.0) sha256=b9d9ddf55116a513f8ef2c7ae660662d8b49301f118d3f0df61865b33a5c188d
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f

BUNDLED WITH
4.0.10
61 changes: 61 additions & 0 deletions lib/carousel_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# frozen_string_literal: true

require 'nokolexbor'

# Extracts carousel entries from the HTML of a Google results page.
#
# Each entry is a Hash with the string keys 'name', 'link' and 'image', plus
# 'extensions' when a second, non-empty label is found next to the name.
class CarouselParser
  # Thumbnails whose source is either looked up by id in an inline script
  # payload or carried directly in a data-src attribute.
  THUMBNAIL_SELECTOR = 'img[data-deferred], img[data-src]'
  # Only anchors whose href contains this marker are treated as carousel items.
  CAROUSEL_LINK_MARKER = 'stick='
  # Matches inline script fragments of the form
  #   var s='data:image/...';var ii=['id1','id2']
  # capturing the data URI and the raw list of quoted image ids.
  DEFERRED_IMAGE_REGEX = %r{var s='(data:image/[^']*)';var ii=\[([^\]]*)\]}
  GOOGLE_BASE_URL = 'https://www.google.com'
  # An href that already carries an http(s) scheme must not be prefixed again.
  ABSOLUTE_URL_REGEX = %r{\Ahttps?://}i
  NBSP = "\u00A0"

  # Convenience wrapper around {#parse}.
  #
  # @param html [String] raw HTML of the results page
  # @return [Hash] see {#parse}
  def self.parse(html)
    new.parse(html)
  end

  # Parses the page and collects every carousel item found in it.
  #
  # @param html [String] raw HTML of the results page
  # @return [Hash{String => Array<Hash>}] { 'artworks' => [item, ...] }
  def parse(html)
    doc = Nokolexbor::HTML(html)
    deferred = deferred_images(html)

    items = carousel_items(doc).map { |img, link| build_item(img, link, deferred) }

    { 'artworks' => items }
  end

  private

  # Builds an id => data URI lookup from the inline script payloads.
  #
  # @param html [String] raw HTML, scanned as text (scripts are not executed)
  # @return [Hash{String => String}] image element id => unescaped data URI
  def deferred_images(html)
    html.scan(DEFERRED_IMAGE_REGEX).each_with_object({}) do |(data, ids), map|
      image = unescape_js(data)
      # One payload may be shared by several image ids.
      ids.scan(/'([^']*)'/) { |(id)| map[id] = image }
    end
  end

  # Pairs every thumbnail with its closest enclosing carousel anchor.
  #
  # @param doc [Object] parsed document responding to #css
  # @return [Array<Array>] [img, link] pairs; thumbnails outside a carousel link are dropped
  def carousel_items(doc)
    doc.css(THUMBNAIL_SELECTOR).filter_map do |img|
      link = img.ancestors('a').first
      [img, link] if link && link['href'].to_s.include?(CAROUSEL_LINK_MARKER)
    end
  end

  # Assembles the result Hash for one carousel entry.
  #
  # @param img [#[]] thumbnail element
  # @param link [#[], #css] anchor element wrapping the thumbnail
  # @param deferred [Hash{String => String}] lookup built by #deferred_images
  # @return [Hash] 'name', optional 'extensions', 'link' and 'image' (nil when no source is known)
  def build_item(img, link, deferred)
    name, date = labels(link)

    item = { 'name' => name }
    item['extensions'] = [date] unless date.nil? || date.empty?
    item['link'] = absolute_url(link['href'])
    # A data-src attribute wins; otherwise fall back to the inline payload for this id.
    item['image'] = img['data-src'] || deferred[img['id']]
    item
  end

  # Resolves an href against the Google origin.
  #
  # @param href [String, nil] href attribute value
  # @return [String] href unchanged when it already has an http(s) scheme,
  #   otherwise GOOGLE_BASE_URL followed by href
  def absolute_url(href)
    href = href.to_s
    href.match?(ABSOLUTE_URL_REGEX) ? href : "#{GOOGLE_BASE_URL}#{href}"
  end

  # Collects the text of every div inside the link that contains only text
  # nodes and is not blank, in document order.
  #
  # @param link [#css] anchor element
  # @return [Array<String>] labels with non-breaking spaces turned into
  #   regular spaces and surrounding whitespace removed
  def labels(link)
    link.css('div').filter_map do |div|
      next unless div.children.all?(&:text?)

      text = div.text
      text.tr(NBSP, ' ').strip unless text.strip.empty?
    end
  end

  # Google escapes certain bytes in Base64 encoded images, like "=" symbols.
  #
  # Every \xNN escape is decoded as the code point U+00NN so that values above
  # 0x7F yield valid UTF-8 text instead of raw binary bytes.
  #
  # @param string [String] JavaScript string literal content
  # @return [String] the string with \xNN escapes replaced by their characters
  def unescape_js(string)
    string.gsub(/\\x([0-9a-fA-F]{2})/) { ::Regexp.last_match(1).to_i(16).chr(Encoding::UTF_8) }
  end
end
Loading