From 0c9da2cb35d752a4070fd0c8181663b1a6df0684 Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 22 Jan 2022 19:06:03 +0100 Subject: [PATCH 01/10] Hash the filename using MD5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Used the obsolete MD5 algorithm to produce file names from the URLs. The MD5 digest is nicely short and should be unique enough to suit our needs. There is no need for security here. Created a new Hash module to avoid introducing new dependencies like ExCrypto, which doesn’t support MD5 anyway. This Elixir module uses and encapsulates the OTP :crypto module. Introduced parametrized tests using for loops on the outside aided by unquote to access values outside the test function in the module definition. --- lib/hash.ex | 22 ++++++++++++ lib/onigumo.ex | 8 +++-- test/hash_test.exs | 79 +++++++++++++++++++++++++++++++++++++++++++ test/onigumo_test.exs | 5 +-- 4 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 lib/hash.ex create mode 100644 test/hash_test.exs diff --git a/lib/hash.ex b/lib/hash.ex new file mode 100644 index 0000000..58d803f --- /dev/null +++ b/lib/hash.ex @@ -0,0 +1,22 @@ +defmodule Hash do + def md5(data, fmt) do + hash(:md5, data) + |> format(fmt) + end + + def format(data, :binary) do + data + end + + def format(data, :hex) do + hex(data) + end + + def hex(data) do + Base.encode16(data, case: :lower) + end + + def hash(func, data) do + :crypto.hash(func, data) + end +end diff --git a/lib/onigumo.ex b/lib/onigumo.ex index 03d503b..c4b0280 100644 --- a/lib/onigumo.ex +++ b/lib/onigumo.ex @@ -3,7 +3,6 @@ defmodule Onigumo do Web scraper """ @input_filename "urls.txt" - @output_filename "body.html" def main() do HTTPoison.start() @@ -19,7 +18,8 @@ defmodule Onigumo do body: body } = http_client.get!(url) - File.write!(@output_filename, body) + filename(url) + |> File.write!(body) end def load_urls(filepath) do @@ -27,6 +27,10 @@ defmodule Onigumo do |> 
Enum.map(&String.trim_trailing/1) end + def filename(url) do + Hash.md5(url, :hex) + end + defp http_client() do Application.get_env(:onigumo, :http_client) end diff --git a/test/hash_test.exs b/test/hash_test.exs new file mode 100644 index 0000000..6434f33 --- /dev/null +++ b/test/hash_test.exs @@ -0,0 +1,79 @@ +defmodule HashTest do + use ExUnit.Case + + @known_md5s [ + { + "", + "d41d8cd98f00b204e9800998ecf8427e", + <<212, 29, 140, 217, 143, 0, 178, 4, 233, 128, 9, 152, 236, 248, + 66, 126>> + }, + { + "onigumo", + "3d8425b6ea2efe0fa78075492c719ffe", + <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, + 159, 254>> + }, + { + "https://www.example.com/", + "dcbfe5ad9e8af3495ca4582e364c1bce", + <<220, 191, 229, 173, 158, 138, 243, 73, 92, 164, 88, 46, 54, 76, 27, 206>> + } + ] + + @binary_hash <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, + 44, 113, 159, 254>> + @hexadecimal_hash "3d8425b6ea2efe0fa78075492c719ffe" + + @data "onigumo" + @known_hashes [ + { + :md5, + <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, + 159, 254>> + }, + { + :sha256, + <<233, 125, 4, 183, 163, 127, 108, 247, 107, 107, 129, 176, 45, + 233, 210, 255, 218, 34, 202, 51, 112, 158, 160, 220, 15, 109, + 229, 143, 188, 196, 45, 128>> + }, + { + :sha512, + <<215, 171, 58, 63, 123, 94, 7, 206, 21, 30, 63, 150, 208, 35, + 179, 69, 235, 190, 128, 183, 0, 89, 237, 183, 155, 8, 190, 178, + 233, 240, 157, 95, 187, 200, 110, 163, 116, 55, 57, 63, 73, 16, + 192, 76, 15, 236, 126, 106, 117, 209, 199, 43, 231, 192, 105, + 122, 247, 100, 47, 100, 178, 231, 31, 217>> + } + ] + + for {data, hash_hex, hash_bin} <- @known_md5s do + test("MD5 known value #{inspect(data)} in hexadecimal") do + hash = Hash.md5(unquote(data), :hex) + assert(hash == unquote(hash_hex)) + end + + test("MD5 known value #{inspect(data)} in binary") do + actual_hash = Hash.md5(unquote(data), :binary) + assert(actual_hash == unquote(hash_bin)) + end + end + + test("format as binary") do + 
formatted = Hash.format(@binary_hash, :binary) + assert(formatted == @binary_hash) + end + + test("format as hexadecimal") do + formatted = Hash.format(@binary_hash, :hex) + assert(formatted == @hexadecimal_hash) + end + + for {func, hash} <- @known_hashes do + test("hash known value with #{func}") do + hash = Hash.hash(unquote(func), @data) + assert(hash == unquote(hash)) + end + end +end diff --git a/test/onigumo_test.exs b/test/onigumo_test.exs index c0bd0a6..7000849 100644 --- a/test/onigumo_test.exs +++ b/test/onigumo_test.exs @@ -3,7 +3,6 @@ defmodule OnigumoTest do import Mox @url "http://onigumo.org/hello.html" - @filename "body.html" @testfile_with_urls "urls.txt" setup(:verify_on_exit!) @@ -21,7 +20,9 @@ defmodule OnigumoTest do ) assert(:ok == Onigumo.download(HTTPoisonMock, @url)) - assert("Body from: #{@url}" == File.read!(@filename)) + + filename = Hash.md5(@url, :hex) + assert("Body from: #{@url}" == File.read!(filename)) end From 01fa73dfa0c99fff29b84ad4e12fd26833872fae Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 22 Jan 2022 19:17:19 +0100 Subject: [PATCH 02/10] Rename :binary to :bin For symmetry with :hex. 
--- lib/hash.ex | 2 +- test/hash_test.exs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/hash.ex b/lib/hash.ex index 58d803f..f90bfa8 100644 --- a/lib/hash.ex +++ b/lib/hash.ex @@ -4,7 +4,7 @@ defmodule Hash do |> format(fmt) end - def format(data, :binary) do + def format(data, :bin) do data end diff --git a/test/hash_test.exs b/test/hash_test.exs index 6434f33..59d2f91 100644 --- a/test/hash_test.exs +++ b/test/hash_test.exs @@ -55,13 +55,13 @@ defmodule HashTest do end test("MD5 known value #{inspect(data)} in binary") do - actual_hash = Hash.md5(unquote(data), :binary) + actual_hash = Hash.md5(unquote(data), :bin) assert(actual_hash == unquote(hash_bin)) end end test("format as binary") do - formatted = Hash.format(@binary_hash, :binary) + formatted = Hash.format(@binary_hash, :bin) assert(formatted == @binary_hash) end From 9ba4f5d7291c115de584b7b42e135ce8e3b318bd Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 22 Jan 2022 19:25:36 +0100 Subject: [PATCH 03/10] Parametrize format tests --- test/hash_test.exs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/test/hash_test.exs b/test/hash_test.exs index 59d2f91..4fa2f92 100644 --- a/test/hash_test.exs +++ b/test/hash_test.exs @@ -21,9 +21,19 @@ defmodule HashTest do } ] - @binary_hash <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, - 44, 113, 159, 254>> - @hexadecimal_hash "3d8425b6ea2efe0fa78075492c719ffe" + @binary_hash <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, + 73, 44, 113, 159, 254>> + @formatted_hashes [ + { + :bin, + <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, + 159, 254>> + }, + { + :hex, + "3d8425b6ea2efe0fa78075492c719ffe" + } + ] @data "onigumo" @known_hashes [ @@ -60,16 +70,13 @@ defmodule HashTest do end end - test("format as binary") do - formatted = Hash.format(@binary_hash, :bin) - assert(formatted == @binary_hash) - end - - test("format as hexadecimal") do - formatted = 
Hash.format(@binary_hash, :hex) - assert(formatted == @hexadecimal_hash) + for {format, hash} <- @formatted_hashes do + test("format as #{format}") do + formatted = Hash.format(@binary_hash, unquote(format)) + assert(formatted == unquote(hash)) + end end - + for {func, hash} <- @known_hashes do test("hash known value with #{func}") do hash = Hash.hash(unquote(func), @data) From fa24bcf7e9639a73faa5a5970ec85d8d8acef73c Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 22 Jan 2022 19:26:22 +0100 Subject: [PATCH 04/10] Wrap long lines --- test/hash_test.exs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/hash_test.exs b/test/hash_test.exs index 4fa2f92..c011156 100644 --- a/test/hash_test.exs +++ b/test/hash_test.exs @@ -17,7 +17,8 @@ defmodule HashTest do { "https://www.example.com/", "dcbfe5ad9e8af3495ca4582e364c1bce", - <<220, 191, 229, 173, 158, 138, 243, 73, 92, 164, 88, 46, 54, 76, 27, 206>> + <<220, 191, 229, 173, 158, 138, 243, 73, 92, 164, 88, 46, 54, 76, + 27, 206>> } ] From fb13dd0c600765efdcc455d0fe324bb20badd1f4 Mon Sep 17 00:00:00 2001 From: Glutexo Date: Wed, 9 Mar 2022 15:56:43 +0100 Subject: [PATCH 05/10] Fix tests --- test/onigumo_test.exs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/onigumo_test.exs b/test/onigumo_test.exs index 6aa1104..a70d40a 100644 --- a/test/onigumo_test.exs +++ b/test/onigumo_test.exs @@ -17,7 +17,7 @@ defmodule OnigumoTest do download_result = Onigumo.download_url(input_url, tmp_dir) assert(download_result == :ok) - output_file_name = Base.url_encode64(input_url, padding: false) + output_file_name = Hash.md5(input_url, :hex) output_path = Path.join(tmp_dir, output_file_name) read_output = File.read!(output_path) expected_output = body(input_url) @@ -36,7 +36,7 @@ defmodule OnigumoTest do Onigumo.download_urls_from_file(tmp_dir) |> Stream.run() Enum.map(@urls, fn url -> - file_name = Base.url_encode64(url, padding: false) + file_name = Hash.md5(url, :hex) output_path = 
Path.join(tmp_dir, file_name) read_output = File.read!(output_path) expected_output = body(url) From d3e6d0bfce8c6aed5f75e227426ed68ebb5efa0e Mon Sep 17 00:00:00 2001 From: dstroch Date: Fri, 11 Mar 2022 19:29:33 +0100 Subject: [PATCH 06/10] Remove hex func At the moment the extra func is not necessary. Add it back when it is needed. Move encode16 to format func. --- lib/hash.ex | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/hash.ex b/lib/hash.ex index f90bfa8..b9ff321 100644 --- a/lib/hash.ex +++ b/lib/hash.ex @@ -9,10 +9,6 @@ defmodule Hash do end def format(data, :hex) do - hex(data) - end - - def hex(data) do Base.encode16(data, case: :lower) end From 1a06b6c47a1695163cdd61fcb4311853586831cd Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 26 Mar 2022 19:54:42 +0100 Subject: [PATCH 07/10] Make variable names consistent --- test/hash_test.exs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/hash_test.exs b/test/hash_test.exs index c011156..f371d4a 100644 --- a/test/hash_test.exs +++ b/test/hash_test.exs @@ -66,8 +66,8 @@ defmodule HashTest do end test("MD5 known value #{inspect(data)} in binary") do - actual_hash = Hash.md5(unquote(data), :bin) - assert(actual_hash == unquote(hash_bin)) + hash = Hash.md5(unquote(data), :bin) + assert(hash == unquote(hash_bin)) end end From cc6ec4ce944304f4f6d4f7df7af797e144885d58 Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 26 Mar 2022 20:51:41 +0100 Subject: [PATCH 08/10] Rename @data to something smarter --- test/hash_test.exs | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/test/hash_test.exs b/test/hash_test.exs index f371d4a..a87f311 100644 --- a/test/hash_test.exs +++ b/test/hash_test.exs @@ -36,7 +36,7 @@ defmodule HashTest do } ] - @data "onigumo" + @known_hash_data "onigumo" @known_hashes [ { :md5, @@ -59,29 +59,31 @@ defmodule HashTest do } ] - for {data, hash_hex, hash_bin} <- @known_md5s do - test("MD5 known value 
#{inspect(data)} in hexadecimal") do - hash = Hash.md5(unquote(data), :hex) - assert(hash == unquote(hash_hex)) + test("hash MD5 known value in hexadecimal") do + for {data, hash_hex, _} <- @known_md5s do + hash = Hash.md5(data, :hex) + assert(hash == hash_hex) end + end - test("MD5 known value #{inspect(data)} in binary") do - hash = Hash.md5(unquote(data), :bin) - assert(hash == unquote(hash_bin)) + test("hash MD5 known value in binary") do + for {data, _, hash_bin} <- @known_md5s do + hash = Hash.md5(data, :bin) + assert(hash == hash_bin) end end - for {format, hash} <- @formatted_hashes do - test("format as #{format}") do - formatted = Hash.format(@binary_hash, unquote(format)) - assert(formatted == unquote(hash)) + test("format a binary hash") do + for {format, hash} <- @formatted_hashes do + formatted = Hash.format(@binary_hash, format) + assert(formatted == hash) end end - for {func, hash} <- @known_hashes do - test("hash known value with #{func}") do - hash = Hash.hash(unquote(func), @data) - assert(hash == unquote(hash)) + test("hash a known value") do + for {func, known_hash} <- @known_hashes do + computed_hash = Hash.hash(func, @known_hash_data) + assert(computed_hash == known_hash) end end end From b79a6e24d83e2e3200666a6eca2870c37eb0acbc Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 26 Mar 2022 21:20:50 +0100 Subject: [PATCH 09/10] Hash filename using md5 --- lib/onigumo.ex | 4 ++-- test/onigumo_test.exs | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/lib/onigumo.ex b/lib/onigumo.ex index 0aee087..4f27b1f 100644 --- a/lib/onigumo.ex +++ b/lib/onigumo.ex @@ -19,7 +19,7 @@ defmodule Onigumo do end def download_url(url, root_path) do - file_name = Hash.md5(url, :hex) + file_name = create_file_name(url) file_path = Path.join(root_path, file_name) url @@ -56,6 +56,6 @@ defmodule Onigumo do end def create_file_name(url) do - Base.url_encode64(url, padding: false) + Hash.md5(url, :hex) end end diff --git a/test/onigumo_test.exs 
b/test/onigumo_test.exs index a78c0f9..f386411 100644 --- a/test/onigumo_test.exs +++ b/test/onigumo_test.exs @@ -17,7 +17,7 @@ defmodule OnigumoTest do download_result = Onigumo.download_url(input_url, tmp_dir) assert(download_result == :ok) - output_file_name = Hash.md5(input_url, :hex) + output_file_name = Onigumo.create_file_name(input_url) output_path = Path.join(tmp_dir, output_file_name) read_output = File.read!(output_path) expected_output = body(input_url) @@ -94,11 +94,8 @@ defmodule OnigumoTest do input_url = "https://onigumo.local/hello.html" created_file_name = Onigumo.create_file_name(input_url) - expected_file_name = Base.url_encode64(input_url, padding: false) + expected_file_name = Hash.md5(input_url, :hex) assert(created_file_name == expected_file_name) - - unexpected_file_name = Base.url_encode64(input_url, padding: true) - assert(created_file_name != unexpected_file_name) end defp prepare_response(url) do @@ -118,7 +115,7 @@ defmodule OnigumoTest do end defp assert_downloaded(url, tmp_dir) do - file_name = Hash.md5(url, :hex) + file_name = Onigumo.create_file_name(url) output_path = Path.join(tmp_dir, file_name) read_output = File.read!(output_path) expected_output = body(url) From 723f731fcea214710ee90b0be1ca049cda73a514 Mon Sep 17 00:00:00 2001 From: Glutexo Date: Sat, 26 Mar 2022 21:31:16 +0100 Subject: [PATCH 10/10] Mix format --- test/hash_test.exs | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/test/hash_test.exs b/test/hash_test.exs index a87f311..0af816b 100644 --- a/test/hash_test.exs +++ b/test/hash_test.exs @@ -5,30 +5,25 @@ defmodule HashTest do { "", "d41d8cd98f00b204e9800998ecf8427e", - <<212, 29, 140, 217, 143, 0, 178, 4, 233, 128, 9, 152, 236, 248, - 66, 126>> + <<212, 29, 140, 217, 143, 0, 178, 4, 233, 128, 9, 152, 236, 248, 66, 126>> }, { "onigumo", "3d8425b6ea2efe0fa78075492c719ffe", - <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, - 159, 254>> + <<61, 
132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, 159, 254>> }, { "https://www.example.com/", "dcbfe5ad9e8af3495ca4582e364c1bce", - <<220, 191, 229, 173, 158, 138, 243, 73, 92, 164, 88, 46, 54, 76, - 27, 206>> + <<220, 191, 229, 173, 158, 138, 243, 73, 92, 164, 88, 46, 54, 76, 27, 206>> } ] - @binary_hash <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, - 73, 44, 113, 159, 254>> + @binary_hash <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, 159, 254>> @formatted_hashes [ { :bin, - <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, - 159, 254>> + <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, 159, 254>> }, { :hex, @@ -40,22 +35,19 @@ defmodule HashTest do @known_hashes [ { :md5, - <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, - 159, 254>> + <<61, 132, 37, 182, 234, 46, 254, 15, 167, 128, 117, 73, 44, 113, 159, 254>> }, { :sha256, - <<233, 125, 4, 183, 163, 127, 108, 247, 107, 107, 129, 176, 45, - 233, 210, 255, 218, 34, 202, 51, 112, 158, 160, 220, 15, 109, - 229, 143, 188, 196, 45, 128>> + <<233, 125, 4, 183, 163, 127, 108, 247, 107, 107, 129, 176, 45, 233, 210, 255, 218, 34, 202, + 51, 112, 158, 160, 220, 15, 109, 229, 143, 188, 196, 45, 128>> }, { :sha512, - <<215, 171, 58, 63, 123, 94, 7, 206, 21, 30, 63, 150, 208, 35, - 179, 69, 235, 190, 128, 183, 0, 89, 237, 183, 155, 8, 190, 178, - 233, 240, 157, 95, 187, 200, 110, 163, 116, 55, 57, 63, 73, 16, - 192, 76, 15, 236, 126, 106, 117, 209, 199, 43, 231, 192, 105, - 122, 247, 100, 47, 100, 178, 231, 31, 217>> + <<215, 171, 58, 63, 123, 94, 7, 206, 21, 30, 63, 150, 208, 35, 179, 69, 235, 190, 128, 183, + 0, 89, 237, 183, 155, 8, 190, 178, 233, 240, 157, 95, 187, 200, 110, 163, 116, 55, 57, 63, + 73, 16, 192, 76, 15, 236, 126, 106, 117, 209, 199, 43, 231, 192, 105, 122, 247, 100, 47, + 100, 178, 231, 31, 217>> } ] @@ -79,7 +71,7 @@ defmodule HashTest do assert(formatted == hash) end end - + test("hash a known value") do for {func, 
known_hash} <- @known_hashes do computed_hash = Hash.hash(func, @known_hash_data)