Skip to content

Commit

Permalink
Add fix variations of mf runner examples #598
Browse files Browse the repository at this point in the history
Reuse all fix workflows from #654 and bring folders together as suggested by @blackwinter in #654 (comment) and in #662
  • Loading branch information
TobiasNx committed Feb 5, 2025
1 parent 16a41b0 commit 592aa08
Show file tree
Hide file tree
Showing 59 changed files with 1,780 additions and 31 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Creates a beacon file based on a PICA+ dump of the DNB CBS data.
//
// Variables:
//   dump   - path of the PICA+ dump; this script sets no default for it,
//            so it has to be supplied by the caller
//   type   - part of the output file name and, via "*", visible to extract.fix
//   out    - output file, defaults to "<dump>-<type>.beacon"
//   header - header file whose lines are written in front of the results

default type = "ALL";
default out = dump + "-" + type + ".beacon";
default header = FLUX_DIR + "header.txt";


// First input of the merge point @Y: the header file, line by line.
"reading header " + header
| write("stdout");

header
| open-file
| as-lines
| @Y;

// Second input of @Y: one line per subject of the collected reference triples.
"counting references in " + dump
| write("stdout");

dump
| open-file
| as-lines
| catch-object-exception
| decode-pica
| batch-log(batchsize="100000")
| fix(FLUX_DIR + "extract.fix", *)
| stream-to-triples(redirect="true")
| sort-triples(by="subject")
| collect-triples
| fix(FLUX_DIR + "output.fix")
| batch-log("merged ${totalRecords}", batchsize="100000")
| stream-to-triples
| template("${s}")
| @Y;

// Write the merged result once both inputs have been delivered.
@Y
| wait-for-inputs("2")
| write(out);



Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Classifies a PICA+ record (DBSM / DEA / DMA / ALL), keeps the classification
# only if it fits the requested $[type], and attaches it to every record that
# is referenced through subfield $9 of the listed fields.
#
# 002@ not repeatable

# Records whose 002@ $0 starts with "Tp" get an "ok" marker.
if any_match("002@.0", "^Tp.*$")
  copy_field("002@.0","ok")
end

# DBSM:
# (006U $0 "04p01*") or (017A $a "yy")
if any_match("006U.0","04p01.*")
  add_field("@value","DBSM|ALL")
elsif any_equal("017A.a","yy")
  add_field("@value","DBSM|ALL")

# DEA:
# (001@ $a "2") or (209A $f "Exilarchiv" or 209A $f "HB/EB")
elsif any_equal("001@.a","2")
  add_field("@value","DEA|ALL")
elsif any_equal("209A.f","Exilarchiv")
  add_field("@value","DEA|ALL")
elsif any_equal("209A.f","HB/EB")
  add_field("@value","DEA|ALL")

# DMA:
# (002@ $0 "G*" or 002@ $0 "M*") or (006U $0 "10,P01*")
# These are patterns, so they need any_match; any_equal would compare the
# regular expression text literally and never succeed.
elsif any_match("002@.0","^[GM].*")
  add_field("@value","DMA|ALL")
elsif any_match("006U.0","^10,P01.*")
  add_field("@value","DMA|ALL")
else
  add_field("@value","ALL")
end

# Test if type variable fits
if any_contain("@value","$[type]")
  add_field("@value","$[type]")
else
  remove_field("@value")
end

# For every listed field that carries a $9 reference, emit the classification
# under "{to:<referenced id>}refed".
# NOTE(review): another script in this commit adds the bare tag next to its
# wildcard ("041A*|041A") because of metafacture-core issue 651 - confirm
# whether the same workaround is needed for the paths below.
do list(path: "041A*|028A*|029B*|028C*|028Q*|028P*|028F*|028M*|028D*|028E*", "var":"$i")
  trim("$i.9")
  to_var("$i.9","ref")
  if exists("$i.9")
    copy_field("@value","{to:$[ref]}refed")
  end
end

# Only the redirect entries and the "ok" marker are passed on.
retain("{to*","ok")

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Post-processing of the collected records: a record without a "refed" field
# has remove_field applied to its "ok" field.
# NOTE(review): remove_field is called with a second, empty argument here -
# confirm that this form is supported and intended.
unless exists("refed")
remove_field("ok","")
end

# NOTE(review): this branch only runs when "ok" is absent and then targets
# "ok" again, which looks like a no-op - confirm whether reject() or a
# different field was intended.
unless exists("ok")
remove_field("ok","")
end

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Reads the PICA file 'fileName', applies gnd-type.fix to every record and
// prints one tab-separated line per counted triple object to stdout.

default fileName = FLUX_DIR + "gnd-sample.pica";

fileName
| open-file
| as-lines
| decode-pica
| fix(FLUX_DIR + "gnd-type.fix")
| stream-to-triples
| count-triples(countBy="object")
| template("${s}\t${o}")
| write("stdout");
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Reduces a record to the first two characters of its 002@ $0 value.
# Records without a 002@ $0 value of at least two characters are rejected.
if any_match("002@.0","...*")
  replace_all("002@.0","^(..).*","$1") # only keep the first two letters
  retain("002@.0") # only keep the relevant element
else
  reject()
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copy subfield $9 of every matching 041A field into the list "relevantField".
# Until https://github.com/metafacture/metafacture-core/issues/651 is fixed
# one has to add "041A" next to the wildcard "041A*".
do list(path:"041A*|041A","var":"$i")
  copy_field("$i.9","relevantField.$append")
end

# Clean up the collected values.
trim("relevantField.*")
uniq("relevantField")

# Nothing but the collected references leaves this fix.
retain("relevantField")
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

// Counts how often each value produced by references.fix occurs in the PICA
// file 'catalogue' and writes the counts to "subjects.dat".
//
// Note: 'counts' is declared but not referenced again in this script.

default counts = "myflux/counts.dat";
default catalogue = FLUX_DIR + "10.pica";

// Progress message
"counting references in " + catalogue
| write("stdout");

catalogue
| open-file
| as-lines
| catch-object-exception
| decode-pica
| fix(FLUX_DIR + "references.fix")
| stream-to-triples
| count-triples(countBy="object")
| write("subjects.dat");


Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ decode-pica|
morph(FLUX_DIR + "references.xml")|
stream-to-triples|
count-triples(countBy="object")|

write("subjects.dat");


9 changes: 9 additions & 0 deletions metafacture-runner/src/main/dist/examples/filter/filter.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Keeps a record only if its 002@ $0 starts with "Tp" and one of its 041R $a
# values contains "arzt" (case-insensitive); every other record is rejected.
if any_match("002@.0","^Tp.*")
  if any_match("041R.a",".*[Aa][Rr][Zz][Tt].*")
    nothing()
  else
    reject()
  end
else
  reject()
end
11 changes: 11 additions & 0 deletions metafacture-runner/src/main/dist/examples/filter/filter.fix.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Opens the file 'fileName', interprets its content as PICA and filters the
// records. Fix does not use the filter function but has its own filter
// mechanism within filter.fix.

default fileName = FLUX_DIR + "gnd-sample.pica";

fileName
| open-file
| as-lines
| decode-pica
| fix(FLUX_DIR + "filter.fix")
| encode-formeta(style="verbose")
| write("stdout");
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ fileName|
open-file|
as-lines|
decode-pica|
filter(FLUX_DIR + "filter-morph.xml")|
filter(FLUX_DIR + "filter.xml")|
encode-formeta(style="verbose")|
write("stdout");
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Reads the PICA dump 'dump', extracts and regroups data with extract.fix and
// output.fix, and writes the result as semicolon-separated, unquoted CSV to
// 'out' (by default base + "Ts1-Tg1-without-crisscross.txt").

default base = "";
default dump = FLUX_DIR + "10.pica";
default out = base + "Ts1-Tg1-without-crisscross.txt";

// Progress message
"counting references in " + dump
| write("stdout");

dump
| open-file
| as-lines
| catch-object-exception
| decode-pica
| batch-log(batchsize="100000")
| fix(FLUX_DIR + "extract.fix")
| stream-to-triples(redirect="true")
| sort-triples(by="subject")
| collect-triples
| fix(FLUX_DIR + "output.fix")
| batch-log(batchsize="100000")
| encode-csv(noquotes="true",separator=";")
| write(out);
Loading

0 comments on commit 592aa08

Please sign in to comment.