Skip to content

Commit

Permalink
Improved compilation time
Browse files Browse the repository at this point in the history
  • Loading branch information
dannote committed Feb 10, 2021
1 parent b5a477e commit 9695d60
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 111 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Add `tongue` to your list of dependencies in `mix.exs`:

```elixir
def deps do
[{:tongue, "~> 1.0.2"}]
[{:tongue, "~> 2.0"}]
end
```

Expand All @@ -28,9 +28,10 @@ iex> Tongue.detect("The octopus is a soft-bodied, eight-armed mollusc of the ord
Detect a language within a subset of the supported languages:

```elixir
iex> subset = Tongue.subset(~w(ru en es fr)a)
iex> Tongue.detect("El microprocesador (o simplemente procesador) es el circuito integrado central más complejo de un sistema informático; a modo de ilustración, se le suele llamar por analogía el «cerebro» de un ordenador.", subset)
[es: 0.9999977345738683]
use Mix.Config

config :tongue,
languages: ~w(en ru fr de)a
```

## Languages
Expand Down
52 changes: 6 additions & 46 deletions lib/tongue.ex
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,25 @@ defmodule Tongue do
[en: 0.9999986358008764]
"""

alias Tongue.{Data, Detector}
alias Tongue.Detector

@doc """
Detects a language. Returns a keyword of scored languages.
## Parameters
- `text` - a text string
- `languages` - a tuple generated by `Tongue.subset/1`
## Examples
iex> Tongue.detect("El microprocesador (o simplemente procesador) es el circuito integrado central más complejo de un sistema informático; a modo de ilustración, se le suele llamar por analogía el «cerebro» de un ordenador.", Tongue.subset(~w(en es fr)a))
iex> Tongue.detect("El microprocesador (o simplemente procesador) es el circuito integrado central más complejo de un sistema informático; a modo de ilustración, se le suele llamar por analogía el «cerebro» de un ordenador.")
[es: 0.9999977345738683]
"""

@spec detect(String.t(), tuple()) :: keyword(char())
def detect(text, languages \\ nil)

def detect(text, nil) do
Detector.detect(text, Data.languages(), Data.ngram_frequencies())
end

def detect(text, selected_languages) do
{languages, ngram_frequencies} = selected_languages
Detector.detect(text, languages, ngram_frequencies)
@spec detect(String.t()) :: keyword(char())
def detect(text) do
Detector.detect(text)
end

@doc """
Expand All @@ -43,38 +35,6 @@ defmodule Tongue do

@spec languages() :: list(atom)
def languages do
Data.languages()
end

@doc """
Strips built-in dataset to selected languages
## Parameters
- `languages` - a list of languages you would like to detect
## Examples
iex> subset = Tongue.subset(~w(ru en es fr)a)
iex> Tongue.detect("Le puits du Magny est l'un des principaux puits des houillères de Ronchamp, situé sur le territoire de la commune de Magny-Danigon", subset)
[fr: 0.9999968121112444]
"""

@spec subset(list(atom)) :: tuple()
def subset(languages) do
builtin_languages = Data.languages()

ngram_frequencies = Enum.into(Data.ngram_frequencies(), %{}, fn {ngram, frequencies} ->
{_, frequencies} =
builtin_languages
|> Enum.zip(frequencies)
|> Enum.filter(fn {language, _} -> language in languages end)
|> Enum.unzip

{ngram, frequencies}
end)

{Enum.sort(languages), ngram_frequencies}
Detector.languages()
end
end
13 changes: 13 additions & 0 deletions lib/tongue/app.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
defmodule Tongue.App do
  @moduledoc false

  use Application

  # Application callback invoked when the :tongue application boots.
  #
  # Starts a supervisor registered under this module's name whose only child
  # is `Tongue.Detector`. With the `:one_for_one` strategy a crashed child is
  # restarted on its own.
  #
  # Both callback arguments (start type and start args) are ignored.
  # Returns whatever `Supervisor.start_link/2` returns.
  @impl true
  def start(_type, _args) do
    children = [
      Tongue.Detector
    ]

    Supervisor.start_link(children, strategy: :one_for_one, name: __MODULE__)
  end
end
29 changes: 0 additions & 29 deletions lib/tongue/data.ex

This file was deleted.

97 changes: 80 additions & 17 deletions lib/tongue/detector.ex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
defmodule Tongue.Detector do
@moduledoc false

alias Tongue.Data
use GenServer

@n_gram 3
@n_trial 7
Expand All @@ -12,14 +12,27 @@ defmodule Tongue.Detector do
@convolution_threshold 0.99999
@base_frequency 10_000

@latin1_excluded Data.messages("NGram.LATIN1_EXCLUDE")
@messages "priv/messages.binary"
|> File.read!()
|> :erlang.binary_to_term()

@blocks "priv/unicode_blocks.binary"
|> File.read!()
|> :erlang.binary_to_term()

@builtin_languages "priv/profiles.binary"
|> File.read!()
|> :erlang.binary_to_term()
|> Map.get(:languages)

@latin1_excluded @messages["NGram.LATIN1_EXCLUDE"]

@normalized_vi_chars ~w(NORMALIZED_VI_CHARS_0300 NORMALIZED_VI_CHARS_0301 NORMALIZED_VI_CHARS_0303
NORMALIZED_VI_CHARS_0309 NORMALIZED_VI_CHARS_0323)
|> Enum.map(&Data.messages/1)
|> Enum.map(&@messages[&1])

@to_normalize_chars Data.messages("TO_NORMALIZE_VI_CHARS")
@dmark_class Data.messages("DMARK_CLASS")
@to_normalize_chars @messages["TO_NORMALIZE_VI_CHARS"]
@dmark_class @messages["DMARK_CLASS"]

# CJK Kanji Normalization mapping

Expand Down Expand Up @@ -50,25 +63,75 @@ defmodule Tongue.Detector do
NGram.KANJI_7_28 NGram.KANJI_7_29 NGram.KANJI_7_32 NGram.KANJI_7_33 NGram.KANJI_7_35
NGram.KANJI_7_37)
|> Enum.flat_map(fn key ->
message = Data.messages(key)
message = @messages[key]
representative = List.first(message)
Enum.map(message, &{&1, representative})
end)
|> Map.new()

def detect(text, languages, ngram_frequencies) do
# Starts the detector process and registers it under this module's name.
# The argument supplied by the supervisor is ignored; `init/1` receives `:ok`.
def start_link(_opts),
  do: GenServer.start_link(__MODULE__, :ok, name: __MODULE__)

# Builds the server state at start-up.
#
# Reads `priv/profiles.binary` from the :tongue application directory,
# decodes it, takes the value stored under `:ngrams_frequencies`, and passes
# it to `subset/2` together with the `:languages` setting of the :tongue
# application environment (which is `nil` when not configured).
#
# Whatever `subset/2` returns becomes the process state.
def init(_) do
  profiles_path = Application.app_dir(:tongue, "priv/profiles.binary")

  ngram_table =
    profiles_path
    |> File.read!()
    |> :erlang.binary_to_term()
    |> Map.get(:ngrams_frequencies)

  state = subset(ngram_table, Application.get_env(:tongue, :languages))

  {:ok, state}
end

# Client API: sends `{:detect, text}` to the registered detector process as a
# synchronous call and returns its reply.
def detect(text), do: GenServer.call(__MODULE__, {:detect, text})

# Client API: sends `:languages` to the registered detector process as a
# synchronous call and returns its reply.
def languages, do: GenServer.call(__MODULE__, :languages)

# Replies with the language list held in the first element of the state
# tuple; the state itself is passed through unchanged.
def handle_call(:languages, _from, {languages, _ngram_frequencies} = state) do
  {:reply, languages, state}
end

def handle_call({:detect, text}, _from, {languages, ngram_frequencies}) do
# Cleaning text to detect
# (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).

text
|> String.replace(~r(https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}), " ")
|> String.replace(~r([-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}), " ")
|> String.to_charlist()
|> clean
|> normalize
|> extract_ngrams(ngram_frequencies)
|> calculate_probabilities(languages, ngram_frequencies)
|> sort_probabilities(languages)
probabilities =
text
|> String.replace(~r(https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}), " ")
|> String.replace(~r([-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}), " ")
|> String.to_charlist()
|> clean
|> normalize
|> extract_ngrams(ngram_frequencies)
|> calculate_probabilities(languages, ngram_frequencies)
|> sort_probabilities(languages)

{:reply, probabilities, {languages, ngram_frequencies}}
end

# Restricts the n-gram frequency table to a chosen set of languages.
#
# Parameters
#   - `ngram_frequencies` - enumerable of `{ngram, frequencies}` pairs, where
#     `frequencies` is a list positionally aligned with `@builtin_languages`.
#   - `languages` - list of language atoms to keep, or `nil` to keep all.
#
# Returns `{languages, ngram_frequencies}`.
#
# With `nil` the input table is returned untouched alongside the full
# built-in language list.
def subset(ngram_frequencies, nil) do
  {@builtin_languages, ngram_frequencies}
end

# Otherwise the result is a map whose frequency lists contain only the
# columns of the selected languages.
#
# The returned language list is derived from `@builtin_languages` (filtered,
# original order kept) rather than from the caller's list. The frequency
# columns are selected in that same order, so the two stay positionally
# aligned even when the caller passes an unsorted list, duplicates, or a
# language that is not built in.
def subset(ngram_frequencies, languages) do
  requested = MapSet.new(languages)

  # One boolean per built-in language, computed once instead of once per
  # n-gram.
  keep_mask = Enum.map(@builtin_languages, &MapSet.member?(requested, &1))

  selected_languages =
    for {language, true} <- Enum.zip(@builtin_languages, keep_mask), do: language

  new_ngram_frequencies =
    Map.new(ngram_frequencies, fn {ngram, frequencies} ->
      kept = for {frequency, true} <- Enum.zip(frequencies, keep_mask), do: frequency
      {ngram, kept}
    end)

  {selected_languages, new_ngram_frequencies}
end

def clean(text) do
Expand Down Expand Up @@ -271,7 +334,7 @@ defmodule Tongue.Detector do
|> Enum.filter(fn {_, probability} -> probability > @probability_threshold end)
end

Enum.map(Data.blocks(), fn {from, to, block} ->
Enum.map(@blocks, fn {from, to, block} ->
def unicode_block(char) when char in unquote(from)..unquote(to) do
unquote(block)
end
Expand Down
11 changes: 6 additions & 5 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ defmodule Tongue.MixProject do
def project do
[
app: :tongue,
version: "1.0.2",
version: "2.0.0",
elixir: "~> 1.7",
start_permanent: Mix.env() == :prod,
deps: deps(),
Expand All @@ -17,16 +17,17 @@ defmodule Tongue.MixProject do

def application do
[
extra_applications: [:logger, :poison]
extra_applications: [:logger, :poison],
mod: {Tongue.App, []}
]
end

defp deps do
[
{:poison, "~> 3.1", only: [:dev, :test]},
{:dialyxir, "~> 1.0.0-rc.3", only: [:dev, :test], runtime: false},
{:credo, "~> 0.10.0", only: [:dev, :test], runtime: false},
{:ex_doc, "~> 0.19.0", only: :dev, runtime: false}
{:dialyxir, "~> 1.0", only: [:dev, :test], runtime: false},
{:credo, "~> 1.5", only: [:dev, :test], runtime: false},
{:ex_doc, "~> 0.22.0", only: :dev, runtime: false}
]
end

Expand Down
23 changes: 13 additions & 10 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
%{
"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [], [], "hexpm"},
"credo": {:hex, :credo, "0.10.0", "66234a95effaf9067edb19fc5d0cd5c6b461ad841baac42467afed96c78e5e9e", [], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm"},
"dialyxir": {:hex, :dialyxir, "1.0.0-rc.3", "774306f84973fc3f1e2e8743eeaa5f5d29b117f3916e5de74c075c02f1b8ef55", [], [], "hexpm"},
"earmark": {:hex, :earmark, "1.2.5", "4d21980d5d2862a2e13ec3c49ad9ad783ffc7ca5769cf6ff891a4553fbaae761", [], [], "hexpm"},
"ex_doc": {:hex, :ex_doc, "0.19.0", "e22b6434373b4870ea77b24df069dbac7002c1f483615e9ebfc0c37497e1c75c", [:mix], [{:earmark, "~> 1.1", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.7", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm"},
"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], [], "hexpm", "7af5c7e09fe1d40f76c8e4f9dd2be7cebd83909f31fee7cd0e9eadc567da8353"},
"credo": {:hex, :credo, "1.5.5", "e8f422026f553bc3bebb81c8e8bf1932f498ca03339856c7fec63d3faac8424b", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2.8", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "dd8623ab7091956a855dc9f3062486add9c52d310dfd62748779c4315d8247de"},
"dialyxir": {:hex, :dialyxir, "1.0.0", "6a1fa629f7881a9f5aaf3a78f094b2a51a0357c843871b8bc98824e7342d00a5", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "aeb06588145fac14ca08d8061a142d52753dbc2cf7f0d00fc1013f53f8654654"},
"earmark": {:hex, :earmark, "1.2.5", "4d21980d5d2862a2e13ec3c49ad9ad783ffc7ca5769cf6ff891a4553fbaae761", [:mix], [], "hexpm", "c57508ddad47dfb8038ca6de1e616e66e9b87313220ac5d9817bc4a4dc2257b9"},
"earmark_parser": {:hex, :earmark_parser, "1.4.12", "b245e875ec0a311a342320da0551da407d9d2b65d98f7a9597ae078615af3449", [:mix], [], "hexpm", "711e2cc4d64abb7d566d43f54b78f7dc129308a63bc103fbd88550d2174b3160"},
"erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"},
"ex_doc": {:hex, :ex_doc, "0.22.6", "0fb1e09a3e8b69af0ae94c8b4e4df36995d8c88d5ec7dbd35617929144b62c00", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "1e0aceda15faf71f1b0983165e6e7313be628a460e22a031e32913b98edbd638"},
"exprintf": {:hex, :exprintf, "0.2.1", "b7e895dfb00520cfb7fc1671303b63b37dc3897c59be7cbf1ae62f766a8a0314", [], [], "hexpm"},
"exprof": {:hex, :exprof, "0.2.3", "8d4d657d73fc0c9ef1e30b2f9207b26ccbd2aec2baf1ca43f0b6d244c841c9f8", [], [{:exprintf, "~> 0.2", [hex: :exprintf, repo: "hexpm", optional: false]}], "hexpm"},
"jason": {:hex, :jason, "1.1.1", "d3ccb840dfb06f2f90a6d335b536dd074db748b3e7f5b11ab61d239506585eb2", [], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm"},
"makeup": {:hex, :makeup, "0.5.1", "966c5c2296da272d42f1de178c1d135e432662eca795d6dc12e5e8787514edf7", [], [{:nimble_parsec, "~> 0.2.2", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm"},
"makeup_elixir": {:hex, :makeup_elixir, "0.8.0", "1204a2f5b4f181775a0e456154830524cf2207cf4f9112215c05e0b76e4eca8b", [], [{:makeup, "~> 0.5.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 0.2.2", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm"},
"nimble_parsec": {:hex, :nimble_parsec, "0.2.2", "d526b23bdceb04c7ad15b33c57c4526bf5f50aaa70c7c141b4b4624555c68259", [], [], "hexpm"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [], [], "hexpm"},
"file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"},
"jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"},
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
"makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"},
"nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"},
"unicode": {:hex, :unicode, "0.0.1", "944687cd531a8994daf116161d0d38591522ab128b692bef70a52595e4488a69", [], [{:earmark, ">= 0.2.1", [hex: :earmark, repo: "hexpm", optional: false]}], "hexpm"},
}

0 comments on commit 9695d60

Please sign in to comment.