From 05f7c0daae89db50236bd370121e5f884459464f Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Wed, 6 Nov 2024 15:00:17 +0100
Subject: [PATCH 1/3] Add an additional table to the database to log errors

---
Review notes (free-form text after the three-dash line; not part of the
commit message):
  * SCHEMA now contains two CREATE TABLE statements, which is why the call
    site switches from execute() to executescript(): sqlite3's execute()
    accepts a single statement only.
  * errors.ts defaults to the current Unix time in whole seconds, so the
    (uid, ts) key can only tell failures of one uid apart when they are
    logged in different seconds.
  * NOTE(review): line structure of this series was restored from a
    whitespace-collapsed copy; indentation inside the hunks is
    reconstructed and should be checked against the repository.

 copietje/console.py  |  2 +-
 copietje/download.py | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/copietje/console.py b/copietje/console.py
index 76afd97..48d2476 100644
--- a/copietje/console.py
+++ b/copietje/console.py
@@ -99,7 +99,7 @@ def download(*, context, database, target=None, limit=None, condenser=None, jobs
 
     with context, sqlite3.connect(database) as database:
         database.row_factory = sqlite3.Row
-        database.cursor().execute(SCHEMA)
+        database.cursor().executescript(SCHEMA)
         # search hansken for the documents to download + minhash
         documents = context.search(Term('type', 'document'), count=limit)
         # issue bulk download with side effects to store all the documents and the minhashes
diff --git a/copietje/download.py b/copietje/download.py
index a650554..601dda0 100644
--- a/copietje/download.py
+++ b/copietje/download.py
@@ -18,7 +18,16 @@
         tags TEXT,
         privileged_status TEXT,
         minhash BLOB
-    )
+    );
+    CREATE TABLE IF NOT EXISTS errors (
+        uid TEXT,
+        -- equivalent to UNIXEPOCH(), which doesn't seem to be supported
+        ts INTEGER DEFAULT (CAST(strftime('%s', 'now') as INTEGER)),
+        stream TEXT,
+        privileged_status TEXT,
+        error TEXT,
+        PRIMARY KEY (uid, ts)
+    );
 """
 
 

From dae9688fb86b3bec072d7298d68258a352d1f4a5 Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Wed, 6 Nov 2024 15:00:42 +0100
Subject: [PATCH 2/3] Use on_error to log errors to the database

---
Review notes (free-form text after the three-dash line; not part of the
commit message):
  * log_error_to_db() is handed to export.bulk() as on_error, with the
    open connection bound through partial(). Each call inserts one row
    into the errors table and commits immediately.
  * A falsy trace.privileged and a missing exception are both stored as
    NULL; any other value is stored as its str() form.
  * NOTE(review): the INSERT carries no conflict clause while errors is
    keyed on (uid, ts) with ts in whole seconds. A second failure for the
    same uid within one second would presumably raise
    sqlite3.IntegrityError from inside the error handler -- confirm
    whether export.bulk can report the same trace twice that quickly.

 copietje/console.py  |  3 ++-
 copietje/download.py | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/copietje/console.py b/copietje/console.py
index 48d2476..88d3af5 100644
--- a/copietje/console.py
+++ b/copietje/console.py
@@ -13,7 +13,7 @@
 from tqdm import tqdm
 
 from copietje import Condenser
-from copietje.download import add_metadata_to_db, determine_stream, SCHEMA
+from copietje.download import add_metadata_to_db, determine_stream, log_error_to_db, SCHEMA
 from copietje.ranking import rank
 
 
@@ -117,6 +117,7 @@ def download(*, context, database, target=None, limit=None, condenser=None, jobs
         export.bulk(documents, target,
                     stream=partial(determine_stream, database=database),
                     side_effect=partial(add_metadata_to_db, database=database, condenser=condenser),
+                    on_error=partial(log_error_to_db, database=database),
                     jobs=jobs)
 
 
diff --git a/copietje/download.py b/copietje/download.py
index 601dda0..cae5277 100644
--- a/copietje/download.py
+++ b/copietje/download.py
@@ -59,6 +59,22 @@ def determine_stream(trace, database=None):
     return selected
 
 
+def log_error_to_db(database, trace, stream, exception=None, **_):
+    database.cursor().execute(
+        """
+        INSERT INTO errors (uid, stream, privileged_status, error)
+        VALUES (?, ?, ?, ?)
+        """,
+        (
+            trace.uid,
+            stream,
+            str(trace.privileged or '') or None,
+            str(exception) if exception else None,
+        )
+    )
+    database.commit()
+
+
 def add_metadata_to_db(database, trace, stream, output, condenser=None, **_):
     mh = None
 

From 4d2082f049cf0cc69b238da5ad281357e0c9cd81 Mon Sep 17 00:00:00 2001
From: Mattijs Ugen <144798+akaIDIOT@users.noreply.github.com>
Date: Wed, 6 Nov 2024 15:05:45 +0100
Subject: [PATCH 3/3] Add comment to schema on compound primary key

---
Review notes (free-form text after the three-dash line; not part of the
commit message):
  * Comment-only change inside the SCHEMA string; the table definition
    itself is unchanged.

 copietje/download.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/copietje/download.py b/copietje/download.py
index cae5277..5ea7c6f 100644
--- a/copietje/download.py
+++ b/copietje/download.py
@@ -26,6 +26,7 @@
         stream TEXT,
         privileged_status TEXT,
         error TEXT,
+        -- compound primary key to allow multiple failures of the same trace uid, differentiated by timestamp
         PRIMARY KEY (uid, ts)
     );
 """