Skip to content

Commit

Permalink
Merge pull request #87 from nappex/links-parser
Browse files Browse the repository at this point in the history
Add a parser for HTML links
  • Loading branch information
Glutexo authored Feb 5, 2023
2 parents 076353f + 36a9324 commit fc4455f
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 1 deletion.
7 changes: 7 additions & 0 deletions lib/spider_html.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
defmodule Spider.HTML do
  @moduledoc """
  HTML helpers for the spider toolbox, built on top of Floki.
  """

  @doc """
  Returns the `href` values of every `a` element found in `document`.

  `document` is an HTML string. It is parsed with `Floki.parse_document!/1`,
  so a document that Floki cannot parse raises instead of returning an error
  tuple. Anchors that carry no `href` attribute contribute nothing to the
  result, and `href` attributes on other elements (such as `link`) are
  ignored because only `a` elements are selected.

  ## Examples

      iex> Spider.HTML.find_links(~s(<a href="/x">x</a><a id="y"></a>))
      ["/x"]

  """
  @spec find_links(String.t()) :: [String.t()]
  def find_links(document) do
    # Start the chain with the bare value so every step reads as a transform.
    document
    |> Floki.parse_document!()
    |> Floki.find("a")
    |> Floki.attribute("href")
  end
end
5 changes: 4 additions & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ defmodule Onigumo.MixProject do
# {:dep_from_hexpm, "~> 0.3.0"},
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
{:httpoison, "~> 1.8"},
{:mox, "~> 1.0", only: :test}
{:mox, "~> 1.0", only: :test},

# Spider toolbox dependencies
{:floki, "~> 0.32"}
]
end

Expand Down
2 changes: 2 additions & 0 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
%{
"certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"},
"floki": {:hex, :floki, "0.32.1", "dfe3b8db3b793939c264e6f785bca01753d17318d144bd44b407fb3493acaa87", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "d4b91c713e4a784a3f7b1e3cc016eefc619f6b1c3898464222867cafd3c681a3"},
"hackney": {:hex, :hackney, "1.18.0", "c4443d960bb9fba6d01161d01cd81173089686717d9490e5d3606644c48d121f", [:rebar3], [{:certifi, "~>2.8.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "9afcda620704d720db8c6a3123e9848d09c87586dc1c10479c42627b905b5c5e"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"httpoison": {:hex, :httpoison, "1.8.0", "6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
Expand Down
30 changes: 30 additions & 0 deletions test/spider_html_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
defmodule SpiderHtmlTest do
  # The test only parses an in-memory fixture and touches no shared state,
  # so it can safely run concurrently with other test modules.
  use ExUnit.Case, async: true

  # Links expected from the fixture, in document order.
  @urls [
    "http://onigumo.local/hello.html",
    "http://onigumo.local/bye.html"
  ]

  # Fixture deliberately contains a `link` element with an `href` and an
  # anchor without one; neither must show up in the result.
  @html """
  <!doctype html>
  <html>
  <head>
  <link href="/media/examples/link-element-example.css" rel="stylesheet">
  </head>
  <body>
  <section id="content">
  <p class="headline">Floki</p>
  <a href="http://onigumo.local/hello.html">Hello</a>
  <a href="http://onigumo.local/bye.html">Bye</a>
  <a id="nothing"></a>
  <span data-model="user">onigumo</span>
  </section>
  </body>
  </html>
  """

  describe "Spider.HTML.find_links/1" do
    test "find links in href attributes of 'a' tags" do
      assert Spider.HTML.find_links(@html) == @urls
    end
  end
end

0 comments on commit fc4455f

Please sign in to comment.