diff --git a/src/os2datascanner/engine2/rules/experimental/__init__.py b/src/os2datascanner/engine2/rules/experimental/__init__.py
index e14e757c5..ce7a14669 100644
--- a/src/os2datascanner/engine2/rules/experimental/__init__.py
+++ b/src/os2datascanner/engine2/rules/experimental/__init__.py
@@ -1,2 +1,3 @@
from . import cpr # noqa
from . import health_rule # noqa
+from . import credit_card # noqa
\ No newline at end of file
diff --git a/src/os2datascanner/engine2/rules/experimental/credit_card.py b/src/os2datascanner/engine2/rules/experimental/credit_card.py
new file mode 100644
index 000000000..add4c5d7b
--- /dev/null
+++ b/src/os2datascanner/engine2/rules/experimental/credit_card.py
@@ -0,0 +1,37 @@
+import re
+from ..rule import SimpleRule
+from ...conversions.types import OutputType
+
+def luhn_algorithm(num: str) -> int:
+    """Return the Luhn check digit (0-9) for the digit string *num*, which
+    must not yet include that check digit. (The last digit of virtually all
+    credit and debit card numbers is a Luhn check digit.)"""
+    double = (len(num) % 2 == 1)  # the rightmost payload digit gets doubled
+    tot = 0
+    for ch in num:
+        v = int(ch) * (2 if double else 1)
+        tot += sum(int(c) for c in str(v))
+        double = not double
+    return (10 - tot % 10) % 10  # a total ending in 0 needs digit 0, not 10
+
+
+class CreditCardRule(SimpleRule):
+    """Finds 16-digit payment card numbers, written as four groups of four
+    digits optionally joined by "-" or " ", whose last digit is the Luhn
+    check digit of the preceding fifteen digits."""
+    operates_on = OutputType.Text
+
+    def __init__(self, **super_kwargs):
+        # Initialise the base class as well, so that whatever per-rule state
+        # it keeps exists before to_json_object() is called.
+        # NOTE(review): assumes the base constructor only takes optional
+        # keyword arguments -- confirm against SimpleRule.
+        super().__init__(**super_kwargs)
+        self._expr = re.compile(r"[0-9]{4}([- ]?[0-9]{4}){3}")
+
+    def match(self, representation: str):
+        """Yield {"match": digits} for every candidate in *representation*
+        that passes the Luhn check; the joiners are stripped from digits."""
+        for mo in self._expr.finditer(representation):
+            # Canonicalise the joiners away
+            num = "".join(ch for ch in mo.group() if ch.isdigit())
+
+            # See if the check digit is what we expect
+            if str(luhn_algorithm(num[:-1])) == num[-1]:
+                yield {
+                    "match": num
+                }
+
+    def to_json_object(self):
+        """Return the base class's JSON form; this rule adds no fields."""
+        return super().to_json_object()
diff --git a/src/os2datascanner/projects/admin/adminapp/templates/miniscan.html b/src/os2datascanner/projects/admin/adminapp/templates/miniscan.html
index 36c91fbde..53d7f6227 100644
--- a/src/os2datascanner/projects/admin/adminapp/templates/miniscan.html
+++ b/src/os2datascanner/projects/admin/adminapp/templates/miniscan.html
@@ -7,9 +7,10 @@
{% block scripts %}
{{ block.super }}
-
-
-
+
+
+
+
{% endblock %}
{% block body %}
@@ -58,6 +59,7 @@
{% trans "File to scan" %}
type="button"
onclick="clearFile()">
{% trans "Clear file" %}
+
@@ -70,9 +72,21 @@
{% trans "File to scan" %}
{% endwith %}
{% trans "Enter your text below" %}
-
-
+
+
+
+
+ {% trans "Enable classification (slower)" %}
+
{% trans "Rule to execute" %}
diff --git a/src/os2datascanner/projects/admin/adminapp/views/miniscanner_views.py b/src/os2datascanner/projects/admin/adminapp/views/miniscanner_views.py
index 4e70df41a..97a698fec 100644
--- a/src/os2datascanner/projects/admin/adminapp/views/miniscanner_views.py
+++ b/src/os2datascanner/projects/admin/adminapp/views/miniscanner_views.py
@@ -36,7 +36,6 @@ def get_context_data(self):
return context
-
def mini_scan(scan_item, rule):
"""
This function will take a scanItem arg as well as a rule arg. It checks
@@ -102,14 +101,7 @@ def mini_scan(scan_item, rule):
item_name=item_name)
-def execute_mini_scan(request):
- """Gets context (item to be scanned, rules to scan for) and performs a scan
- on the item (file or raw text) recieved. Will cause an internal server
- error (500 error code) if the scan rule does not get sent. This happens
- when the user is not logged in / gets logged out for inactivity. However
- this is only backend side and it does not cause any trouble on the
- website.
- """
+def execute_mini_scan(request): # noqa:CCR001
context = {
"file_obj": (file_obj := request.FILES.get("file")),
"text": (text := request.POST.get("text")),
@@ -130,4 +122,4 @@ def execute_mini_scan(request):
for m in mini_scan(text, rule):
replies.append(m)
- return render(request, "components/miniscanner/miniscan_results.html", context)
+ return render(request, "components/miniscanner/miniscan_results.html", context)
\ No newline at end of file
diff --git a/src/os2datascanner/projects/admin/locale/da/LC_MESSAGES/django.po b/src/os2datascanner/projects/admin/locale/da/LC_MESSAGES/django.po
index e431cfea9..3fa821740 100644
--- a/src/os2datascanner/projects/admin/locale/da/LC_MESSAGES/django.po
+++ b/src/os2datascanner/projects/admin/locale/da/LC_MESSAGES/django.po
@@ -9,7 +9,7 @@ msgid ""
msgstr ""
"Project-Id-Version: \n"
"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2024-07-11 14:53+0200\n"
+"POT-Creation-Date: 2024-07-03 14:44+0200\n"
"PO-Revision-Date: 2021-08-07 09:10+0200\n"
"Last-Translator: Magenta Aps\n"
"Language-Team: 2. sal\n"
@@ -2005,78 +2005,74 @@ msgstr ""
"enkel fil eller tekst-stykke. "
"Resultaterne vises længere nede på denne side."
-#: adminapp/templates/miniscan.html:31 adminapp/templates/miniscan.html:66
+#: adminapp/templates/miniscan.html:30 adminapp/templates/miniscan.html:60
msgid "Maximum allowed file size is: "
msgstr "Den maksimalt tilladte filstørrelse er: "
-#: adminapp/templates/miniscan.html:45
+#: adminapp/templates/miniscan.html:44
msgid "File to scan"
msgstr "Upload fil"
-#: adminapp/templates/miniscan.html:56 adminapp/templates/miniscan.html:57
-msgid "Clear selected file"
-msgstr "Ryd valgt fil"
-
-#: adminapp/templates/miniscan.html:60
+#: adminapp/templates/miniscan.html:54
msgid "Clear file"
msgstr "Ryd fil"
-#: adminapp/templates/miniscan.html:64
+#: adminapp/templates/miniscan.html:58
msgid "The selected file is too big!"
msgstr "Den valgte fil er for stor!"
-#: adminapp/templates/miniscan.html:68
+#: adminapp/templates/miniscan.html:62
msgid "Please, select another (smaller) file to scan."
msgstr "Vælg venligst at scanne en anden (mindre) fil."
-#: adminapp/templates/miniscan.html:72
+#: adminapp/templates/miniscan.html:66
msgid "Enter your text below"
-msgstr "Tast venligst text nedenfor"
+msgstr "Tast venligst tekst nedenfor"
-#: adminapp/templates/miniscan.html:77
+#: adminapp/templates/miniscan.html:72
msgid "Rule to execute"
msgstr "Definér regel"
-#: adminapp/templates/miniscan.html:79
+#: adminapp/templates/miniscan.html:74
msgid "Selected rule: "
msgstr "Valgte regel: "
-#: adminapp/templates/miniscan.html:80
+#: adminapp/templates/miniscan.html:75
msgid "None"
msgstr "Ingen"
-#: adminapp/templates/miniscan.html:86 adminapp/templates/miniscan.html:87
-#: adminapp/templates/miniscan.html:120
+#: adminapp/templates/miniscan.html:81 adminapp/templates/miniscan.html:82
+#: adminapp/templates/miniscan.html:115
msgid "Upload a rule from the list"
msgstr "Indlæs en regel fra listen"
-#: adminapp/templates/miniscan.html:87
+#: adminapp/templates/miniscan.html:82
msgid "Load rule"
msgstr "Indlæs regel"
-#: adminapp/templates/miniscan.html:93 adminapp/templates/miniscan.html:94
-#: adminapp/templates/miniscan.html:95
+#: adminapp/templates/miniscan.html:88 adminapp/templates/miniscan.html:89
+#: adminapp/templates/miniscan.html:90
msgid "Run"
msgstr "Kør"
-#: adminapp/templates/miniscan.html:93 adminapp/templates/miniscan.html:94
+#: adminapp/templates/miniscan.html:88 adminapp/templates/miniscan.html:89
msgid "scanner"
msgstr "scanner"
-#: adminapp/templates/miniscan.html:98 adminapp/templates/miniscan.html:99
-#: adminapp/templates/miniscan.html:100
+#: adminapp/templates/miniscan.html:93 adminapp/templates/miniscan.html:94
+#: adminapp/templates/miniscan.html:95
msgid "Clear"
msgstr "Ryd"
-#: adminapp/templates/miniscan.html:98 adminapp/templates/miniscan.html:99
+#: adminapp/templates/miniscan.html:93 adminapp/templates/miniscan.html:94
msgid "content"
msgstr "indhold"
-#: adminapp/templates/miniscan.html:103
+#: adminapp/templates/miniscan.html:98
msgid "Scan result"
msgstr "Scanresultat"
-#: adminapp/templates/miniscan.html:104
+#: adminapp/templates/miniscan.html:99
msgid "(none yet)"
msgstr "(endnu ingen)"
@@ -3388,6 +3384,9 @@ msgstr "Navnet er allerede i brug."
msgid "Account not found."
msgstr "Konto ikke fundet."
+#~ msgid "Write here"
+#~ msgstr "Skriv her"
+
#~ msgid "grants"
#~ msgstr "bevillinger"
diff --git a/src/os2datascanner/projects/static/js/scannerjob/clearFile.js b/src/os2datascanner/projects/static/js/scannerjob/clearFile.js
new file mode 100644
index 000000000..9280bc577
--- /dev/null
+++ b/src/os2datascanner/projects/static/js/scannerjob/clearFile.js
@@ -0,0 +1,7 @@
+// Clears the selected file in the miniscanner file selection.
+// Implemented for the user to be able to scan only text.
+
+function clearFile() {
+  // Reset the file input so that a text-only scan can be submitted.
+  const inputField = document.getElementById("upload-file");
+  // The script may be loaded on a page without the upload field; do nothing
+  // there instead of throwing a TypeError from the click handler.
+  if (inputField) {
+    inputField.value = "";
+  }
+}
\ No newline at end of file
diff --git a/src/os2datascanner/projects/static/scss/components/_forms.scss b/src/os2datascanner/projects/static/scss/components/_forms.scss
index ff2ae290c..12c807066 100644
--- a/src/os2datascanner/projects/static/scss/components/_forms.scss
+++ b/src/os2datascanner/projects/static/scss/components/_forms.scss
@@ -563,7 +563,7 @@
#file-clear-btn {
position: absolute;
width: 5%;
- height: 46px;
+ height: 8.5%;
margin-left: 20px;
}
diff --git a/src/os2datascanner/utils/optimiser/optimiser.py b/src/os2datascanner/utils/optimiser/optimiser.py
new file mode 100644
index 000000000..6d3e9fe5c
--- /dev/null
+++ b/src/os2datascanner/utils/optimiser/optimiser.py
@@ -0,0 +1,310 @@
+import json
+
+# Maps a short integer id to the exact JSON text of a known leaf rule.
+# load_json(extensive=True) replaces each of these texts with its id, and
+# RuleOptimiser.reformat_rule() substitutes the texts back afterwards.
+# Matching is plain str.replace, so a rule is only recognised when its JSON
+# is written byte-for-byte like the value here (same key order, no spaces).
+rule_ids = {
+ 1:'{"type":"cpr","modulus_11":true,"ignore_irrelevant":true,"examine_context":true,"exceptions":""}',
+ 2: '{"type":"address","whitelist":[],"blacklist":[]}',
+ 3: '{"type":"name","whitelist":[],"blacklist":[],"expansive":false}',
+ 4: '{"type":"name","whitelist":[],"blacklist":[],"expansive":true}',
+ 5: '{"type":"ordered-wordlist","dataset":"da_20211018_laegehaandbog_stikord"}',
+ 6: '{"type":"passport"}'
+}
+
+class RuleOptimiser():
+ """Simplifies a boolean rule tree (nested "and" / "or" / "not" nodes)
+ stored as JSON. Constructing an instance copies the input file to the
+ output path; run_optimiser() then rewrites the output file in place."""
+ def __init__(self, default_input_path: str, default_output_path: str) -> None:
+ self.IN_PATH = default_input_path
+ self.OUT_PATH = default_output_path
+ self.containers: list['CustomContainer'] = []
+ self.found_redundancy: bool = True
+ self.cycles: int = 0
+ # setup() does file I/O right away: it reads IN_PATH and writes OUT_PATH
+ self.setup()
+
+ # The current content of OUT_PATH; the file is re-read on every access.
+ @property
+ def clean_rule(self):
+ return load_json(self.OUT_PATH)
+
+ def setup(self):
+ """Copies the original rule into
+ another file where it is cleaned"""
+ # extensive=True swaps the JSON text of every rule listed in rule_ids
+ # for its integer id, so known leaf rules are plain ints from here on
+ obj = load_json(self.IN_PATH, extensive=True)
+ dump_json(obj, self.OUT_PATH)
+
+ def containify(self, container: 'CustomContainer'):
+ """Transforms all {"type": "and" | "or", "components": [...]}
+ into the CustomContainer class, so they become easier to
+ handle in python, rather than dictionaries. Not k becomes -k.
+
+ *JSON to CustomContainer*"""
+
+ # NOTE(review): arr_switch() removes from and appends to the very list
+ # this loop iterates over, so elements can be skipped -- confirm that
+ # the repeated passes in optimisation_cycle() make up for it.
+ for k in container.components:
+ if isinstance(k, dict):
+ if k["type"] == "not":
+ # Unary minus: k["rule"] has to be a number here (presumably an id
+ # from rule_ids; verify what happens for rules without an id)
+ container.components = arr_switch(arr=container.components, deletion=k, insertion=-k["rule"])
+ else:
+ new = CustomContainer(k["type"], k["components"], container)
+ container.components = arr_switch(arr=container.components, deletion=k, insertion=new)
+ # arr_switch() appended the replacement, so it is the last component
+ self.containers.append(container.components[-1])
+ yield container.components[-1]
+
+
+ def get_containers(self, main: 'CustomContainer'):
+ """Gets a list of all the containers.
+ Repeats itself with containify, but
+ otherwise the code wouldn't work."""
+ found = [main]
+ next_up: list[CustomContainer] = [main]
+ # NOTE(review): nothing is ever added to next_up, so only main's direct
+ # components are inspected here; deeper levels are not visited.
+ while next_up:
+ for cont in next_up:
+ # NOTE(review): cont.components is modified (remove/append) while it
+ # is being iterated, which can skip elements -- confirm.
+ for comp in cont.components:
+ if isinstance(comp, dict):
+ new = CustomContainer(comp["type"], comp["components"], cont)
+ found.append(new)
+ cont.components.remove(comp)
+ cont.components.append(new)
+ self.containers.append(new) # [new piece of code]
+ elif isinstance(comp, CustomContainer):
+ found.append(comp)
+ next_up.remove(cont)
+ return found
+
+
+ def check_empty(self, container:'CustomContainer'):
+ # Checks if container has no components i.e. AND(null) | OR(null)
+ # An empty container is detached from its parent and from
+ # self.containers; a ValueError from either remove() (e.g. for the root,
+ # which is its own parent and not among its own components) is ignored.
+ if not container.components:
+ try:
+ container.parent.components.remove(container)
+ self.containers.remove(container)
+ self.found_redundancy = True
+ except ValueError:
+ pass
+ return True
+ return False
+
+
+ def check_useless(self, container:'CustomContainer'):
+ # Useless i.e. AND(1), OR(1), etc ...
+ # The container is replaced in its parent by its only component.
+ # arr_switch() appends, so the component ends up last in the parent.
+ if len(container.components) == 1:
+ try:
+ container.parent.components = arr_switch(container.parent.components, container, container.components[0])
+ self.containers.remove(container)
+ self.found_redundancy = True
+ except ValueError:
+ pass
+ return True
+ return False
+
+
+ def check_symbol_redundancy(self, container:'CustomContainer'):
+ # Checks for AND(n, -n) | OR(n, -n)
+ # NOTE(review): the container is dropped from its parent for both
+ # operators, although AND(n, -n) is always false and OR(n, -n) always
+ # true, so dropping it is only neutral for some parents -- confirm.
+ # Unlike check_empty()/check_useless(), the remove() calls below are not
+ # guarded against ValueError.
+ mem = []
+ for k in container.components:
+ mem.append(k)
+ if isinstance(k, int):
+ if -k in mem:
+ container.parent.components.remove(container)
+ self.containers.remove(container)
+ self.found_redundancy = True
+ return True
+ return False
+
+ def optimisation_cycle(self):
+ """Does all the checks on the loaded rule as cycles.
+ If a cycle completed, a new one starts if the previous
+ one found anything. If not, main() ends."""
+ rule = load_json(self.OUT_PATH)
+
+ main = CustomContainer(operator=rule["type"], components=rule["components"])
+ # The root is its own parent, so cont.parent is always a container
+ main.parent = main
+
+ self.found_redundancy = True
+ self.cycles = 0
+
+ while self.found_redundancy:
+ self.cycles += 1
+ self.found_redundancy = False
+
+ future = [main]
+ self.containers = [main]
+
+
+ # Convert nested dictionaries into CustomContainer objects.
+ # NOTE(review): future is modified (remove/extend) while the for loop
+ # iterates over it; the surrounding while re-runs until it is empty.
+ while future:
+ for k in future:
+ future.remove(k)
+ future.extend(list(self.containify(k)))
+
+ self.containers.extend(self.get_containers(main))
+ self.containers = list(remove_duplicates(self.containers))
+
+ # NOTE(review): self.containers is iterated here while the body and the
+ # check_*() helpers remove entries from it, so containers can be
+ # skipped within one pass.
+ for cont in self.containers:
+ if isinstance(cont, CustomContainer):
+
+ # Drop repeated components inside one container, e.g. AND(a, a, b)
+ unique_components = list(remove_duplicates(cont.components.copy()))
+ if not list_equality_no_order(cont.components.copy(), unique_components):
+ cont.components = unique_components
+ self.found_redundancy = True
+
+ # Components this container shares with its parent
+ duplicates = list(find_duplicates(cont.components, cont.parent.components))
+
+ if duplicates and cont != main:
+ self.found_redundancy = True
+
+ # Same call as above; nothing it reads has changed in between
+ duplicates = list(find_duplicates(cont.components, cont.parent.components))
+
+ if cont.operator == "and" and cont.parent.operator == "or":
+ cont.parent.components = inverse_extend(cont.parent.components, duplicates)
+ else:
+ cont.components = inverse_extend(cont.components, duplicates)
+
+ # On redundancy, always remove duplicate element from child container
+ # except in the case where parent is OR and child is AND.
+
+ # a OR b OR (b AND c) != a OR b OR (c AND True) [-]
+ # a OR b OR (b AND c) == a OR (b AND c) [+]
+
+ # NOTE(review): by the absorption laws, b OR (b AND c) == b and
+ # a AND (a OR b) == a. With b true and c false the first line
+ # marked [+] has a true left side while its right side equals a,
+ # and removing a from the child of a AND (a OR b) gives a AND b.
+ # Neither rewrite looks equivalence-preserving -- confirm.
+
+ # Same operator as the parent: splice the components into the parent
+ if cont.operator == cont.parent.operator and cont != main:
+ try:
+ cont.parent.components.remove(cont)
+ self.containers.remove(cont)
+ cont.parent.components.extend(cont.components)
+ continue
+ except ValueError:
+ pass
+ if self.check_empty(cont) or self.check_useless(cont) or self.check_symbol_redundancy(cont):
+ continue
+
+ dump_json(main.as_dict(), self.OUT_PATH)
+ return self.cycles > 1 # If it did more than 1 cycle, it means it found something
+
+ def reformat_rule(self, path):
+ # Expands the integer ids back into the full rule JSON from rule_ids.
+ # The rule is read from path, but the result always goes to OUT_PATH and,
+ # unlike dump_json(), is written without indentation.
+ # str() of the loaded object is Python repr text rather than JSON, hence
+ # the quote fix-up below.
+ # NOTE(review): str.replace() substitutes the digits of an id wherever
+ # they occur in that text, not only where they stand alone as a
+ # component, and True/False/None in the repr would not parse as JSON --
+ # confirm the optimised tree can never contain either.
+ obj = str(load_json(path))
+ for id in rule_ids:
+ obj = obj.replace(str(id), rule_ids[id])
+ obj = obj.replace("'", '"') # Fixing some double | single quote problems
+ obj = json.loads(obj)
+ with open(self.OUT_PATH, "wt") as file:
+ json.dump(obj, file)
+
+ def run_optimiser(self):
+ # Repeats optimisation_cycle() until a call reports that it found
+ # nothing, then expands the ids in OUT_PATH back into full rules.
+ refactored = True
+ while refactored:
+ refactored = self.optimisation_cycle()
+ # Get previous cycles.
+ # While previous optimisation_cycle() found stuff, run it again.
+ # Maybe check for risk of infinite recursion
+ self.reformat_rule(self.OUT_PATH)
+
+
+class CustomContainer():
+    """Object form of one boolean node of a rule, i.e. of a dictionary
+    shaped like {"type": "and" | "or", "components": [...]}, which is
+    easier to manipulate than the raw dictionary."""
+
+    def __init__(self, operator: str, components: list, parent=None) -> None:
+        # Only the two boolean operators are valid node types.
+        if operator != "and" and operator != "or":
+            raise ValueError(f"Invalid operator[op={operator}]")
+
+        self.operator = operator
+        self.components = components
+        self.parent: 'CustomContainer' = parent
+
+    def as_dict(self):
+        """
+        Build the dictionary form of this node, converting nested containers
+        recursively, so the result can be serialised as JSON again.
+        """
+        parts = []
+        for item in self.components:
+            if isinstance(item, CustomContainer):
+                item = item.as_dict()
+            parts.append(item)
+        return {'type': self.operator, 'components': parts}
+
+    def __repr__(self):
+        """Show the dictionary form instead of the default
+        <... CustomContainer object at 0x...> text."""
+        return f"{self.as_dict()}"
+
+def list_equality_no_order(arr1, arr2):
+    """Return True when arr1 and arr2 hold the same elements with the same
+    multiplicities, regardless of order (in Python [1, 2, 3] == [3, 2, 1]
+    is False, but here they count as equal). Elements are compared with
+    ==, so unhashable items such as dicts work. Neither list is modified."""
+    # Work on a copy: removing from arr1 itself would empty the caller's list.
+    remaining = list(arr1)
+    for k in arr2:
+        try:
+            remaining.remove(k)
+        except ValueError:
+            # k is missing from arr1, or occurs more often in arr2
+            return False
+    # Anything left over occurs only (or more often) in arr1
+    return not remaining
+
+def find_duplicates(arr1, arr2):
+    """Lazily yield, in arr1 order, each element of arr1 that is also
+    contained in arr2 (repeats in arr1 are yielded every time)."""
+    yield from (item for item in arr1 if item in arr2)
+
+def inverse_extend(arr1, deletions):
+    """Opposite of list.extend(): remove from arr1, in place, one occurrence
+    of every element of deletions, and return arr1 itself. A ValueError is
+    raised as soon as an element to delete is not present."""
+    for unwanted in deletions:
+        try:
+            arr1.remove(unwanted)
+        except ValueError:
+            message = "ValueError occurred in inverse_extend()"
+            raise ValueError(message)
+    return arr1
+
+def load_json(path, extensive=False):
+    """Load and return the JSON document stored at *path*.
+
+    When *extensive* is true, every verbatim occurrence of a known rule's
+    JSON text (the values of rule_ids) is replaced by its integer id before
+    parsing, so those rules come back as plain ints."""
+    # JSON text is UTF-8; do not depend on the platform's default encoding.
+    with open(path, encoding="utf-8") as file:
+        obj = file.read()
+    if extensive:
+        for rule_id, rule_json in rule_ids.items():
+            obj = obj.replace(rule_json, str(rule_id))
+    return json.loads(obj)
+
+def dump_json(obj, output_path):
+    """Serialise *obj* as JSON, indented by four spaces, into the file at
+    *output_path*, creating or overwriting it."""
+    with open(output_path, "wt") as file:
+        file.write(json.dumps(obj, indent=4))
+
+def display(data):
+    """Pretty-print *data* as JSON with four-space indentation.
+
+    Accepts anything json can serialise, objects that expose as_dict()
+    (such as CustomContainer, also when nested in lists or dicts), and raw
+    JSON passed as bytes. Raises TypeError for anything else."""
+    def _as_jsonable(value):
+        # Hook for json.dumps: only called for values it cannot serialise.
+        converter = getattr(value, "as_dict", None)
+        if callable(converter):
+            return converter()
+        raise TypeError(
+            f"Object of type {type(value).__name__} is not JSON serializable")
+
+    if isinstance(data, (bytes, bytearray)):
+        # bytes cannot be dumped directly; treat them as encoded JSON text
+        data = json.loads(data)
+    print(json.dumps(data, indent=4, default=_as_jsonable))
+
+def arr_switch(arr: list, deletion, insertion):
+    """Swap one element of *arr* for another, in place: the first item equal
+    to *deletion* is removed, *insertion* is added at the end, and the same
+    list object is returned. Raises ValueError if *deletion* is absent."""
+    arr.remove(deletion)
+    arr += [insertion]
+    return arr
+
+def remove_duplicates(arr):
+    """Yield the elements of *arr* in their original order, skipping every
+    element that compares equal to one already yielded. Seen items are
+    kept in a list and compared with ==, not hashed as set() would do."""
+    seen = []
+    for item in arr:
+        if item not in seen:
+            seen.append(item)
+            yield item
+
+def container_count_op(container: CustomContainer, op):
+    """Count the direct components of *container* that are CustomContainer
+    instances whose operator equals *op*; nested levels are not visited."""
+    return sum(
+        1
+        for child in container.components
+        if isinstance(child, CustomContainer) and child.operator == op
+    )
+
+def contains_container(cont):
+    """Tell whether at least one direct component of *cont* is itself a
+    CustomContainer."""
+    for child in cont.components:
+        if isinstance(child, CustomContainer):
+            return True
+    return False
diff --git a/src/os2datascanner/utils/optimiser/optimiser_usage.py b/src/os2datascanner/utils/optimiser/optimiser_usage.py
new file mode 100644
index 000000000..62b6d90dc
--- /dev/null
+++ b/src/os2datascanner/utils/optimiser/optimiser_usage.py
@@ -0,0 +1,13 @@
+"""Example: optimise original_rule.json, located next to this script, and
+write the result to output_rule.json in the same directory."""
+import os
+
+from optimiser import RuleOptimiser
+
+# Resolve the data files relative to this script instead of one developer's
+# checkout; the trailing separator keeps the DIR + name concatenation valid.
+DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "")
+IN_PATH = DIR + "original_rule.json"
+OUT_PATH = DIR + "output_rule.json"
+
+
+def clean_rule(rule_path, output_path):
+    """Optimise the rule stored at *rule_path*, write the result to
+    *output_path*, and return the optimised rule as read from that file."""
+    rule_optimiser = RuleOptimiser(rule_path, output_path)
+    rule_optimiser.run_optimiser()
+    return rule_optimiser.clean_rule
+
+
+clean = clean_rule(IN_PATH, OUT_PATH)
+# print(clean)
\ No newline at end of file
diff --git a/src/os2datascanner/utils/optimiser/original_rule.json b/src/os2datascanner/utils/optimiser/original_rule.json
new file mode 100644
index 000000000..89fdf126d
--- /dev/null
+++ b/src/os2datascanner/utils/optimiser/original_rule.json
@@ -0,0 +1 @@
+{"type":"and","components":[{"type":"cpr","modulus_11":true,"ignore_irrelevant":true,"examine_context":true,"exceptions":""},{"type":"or","components":[{"type":"address","whitelist":[],"blacklist":[]},{"type":"cpr","modulus_11":true,"ignore_irrelevant":true,"examine_context":true,"exceptions":""},{"type":"name","whitelist":[],"blacklist":[],"expansive":false}]}]}
\ No newline at end of file
diff --git a/src/os2datascanner/utils/optimiser/output_rule.json b/src/os2datascanner/utils/optimiser/output_rule.json
new file mode 100644
index 000000000..fd628c0a0
--- /dev/null
+++ b/src/os2datascanner/utils/optimiser/output_rule.json
@@ -0,0 +1 @@
+{"type": "and", "components": [{"type": "cpr", "modulus_11": true, "ignore_irrelevant": true, "examine_context": true, "exceptions": ""}, {"type": "or", "components": [{"type": "address", "whitelist": [], "blacklist": []}, {"type": "name", "whitelist": [], "blacklist": [], "expansive": false}]}]}
\ No newline at end of file