DATA & CHECKPOINT release and minor path fixes
rexhinab authored and committed Apr 14, 2021
1 parent e65d00d commit b406b53
Showing 11 changed files with 29 additions and 21 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
+.idea/*
+*.pyc
+__pycache__/
+*egg-info/
+*.swp
+*.pt
+*.th
+models/*
+!.gitignore
13 changes: 10 additions & 3 deletions README.md
@@ -20,18 +20,25 @@ If you find either our code or our released datasets useful in your work, please
}
```

-#### If you need our silver data and/or XL-AMR checkpoints, please contact us by email!
+#### If you need our silver data based on the AMR 2.0 translations, please contact us by email!

+Download the best XL-AMR checkpoints* per language here: [CHECKPOINTS](https://drive.google.com/drive/folders/1_tu6EJET20pi5IG3T-807hDpBjtkPs7W?usp=sharing)

+<sub>*In case of exceptions, please first check that the paths in the config file match your file structure before sending an email or opening an issue.</sub>


## 1. Install

-Create a conda environment with **Python 3.6** and **PyTorch 1.5.0** and install the dependencies [requirements.txt](requirements.txt).
+Create a conda environment with **Python 3.6** and **PyTorch 1.5.0**, install the dependencies from [requirements.txt](requirements.txt), and download the artifacts**.

Via conda:

conda create -n xlamr python=3.6
source activate xlamr
pip install -r requirements.txt

+bash scripts/download_artifacts.sh

+**Please also unzip all the zipped files inside the `data` folder before continuing with the remaining steps.

## 2. Gold Dataset
1 - Download AMR 2.0 ([LDC2017T10](https://catalog.ldc.upenn.edu/LDC2017T10)) and AMR 2.0 - Four Translations ([LDC2020T07](https://catalog.ldc.upenn.edu/LDC2020T07)).
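The install instructions above end with an unzip step for the archives this commit adds under `data/` (listed as binary files below). A minimal sketch of that step, assuming each archive is simply extracted in place under `data/`; the layout inside each zip is not shown here, so adjust the target directory if an archive is organized differently:

```python
# Hedged sketch of the "unzip everything under data/" step from the README.
# Assumption: each archive extracts in place next to the zip file.
import zipfile
from pathlib import Path

data_dir = Path("data")
for archive in sorted(data_dir.glob("*.zip")):
    print(f"Extracting {archive} ...")
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(data_dir)
```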
3 changes: 0 additions & 3 deletions data/.gitignore

This file was deleted.

Binary file added data/AMR.zip
Binary file added data/cross-lingual-babelnet_mappings.zip
Binary file added data/misc.zip
Binary file added data/numberbatch.zip
2 changes: 1 addition & 1 deletion xlamr_stog/data/data_misc/numberbatch_emb.py
@@ -5,7 +5,7 @@

outdir = "data/numberbatch"
embeddings_path = os.path.join(outdir,'out_{}.txt')
-with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de.txt"),"w", encoding="utf-8") as outfile:
+with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de_zh.txt"),"w", encoding="utf-8") as outfile:
for lang in ["en", "it", "es", "de","zh"]:
for line in open(embeddings_path.format(lang),"r", encoding="utf-8"):
fields = line.rstrip().split()
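The hunk above only renames the merged output file to include Chinese (`_zh`); the rest of the loop is truncated. A hedged sketch of the concatenation the script appears to perform, merging the per-language `out_{lang}.txt` Numberbatch files into one multilingual file; the header-skipping heuristic and any further filtering are assumptions, not taken from the hunk:

```python
import os

outdir = "data/numberbatch"
embeddings_path = os.path.join(outdir, "out_{}.txt")

# Merge the per-language embedding files into a single multilingual file,
# now including Chinese ("zh") alongside en/it/es/de.
with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de_zh.txt"),
          "w", encoding="utf-8") as outfile:
    for lang in ["en", "it", "es", "de", "zh"]:
        with open(embeddings_path.format(lang), "r", encoding="utf-8") as infile:
            for line in infile:
                fields = line.rstrip().split()
                # Assumption: skip short header/count lines such as "<rows> <dims>".
                if len(fields) < 3:
                    continue
                outfile.write(" ".join(fields) + "\n")
```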
4 changes: 2 additions & 2 deletions xlamr_stog/data/dataset_readers/amr_parsing/amr.py
@@ -123,14 +123,14 @@ def __repr__(self):
try:
fields.append(str(v))
except:
fields.append("BAD_GRAPH")
fields.append("(b / BAD_GRAPH)")
elif k == 'graph_pred':
if v==None: continue
try:
fields.append("\n#****GOLD_GRAPH*******\n")
fields.append(str(v))
except:
fields.append("BAD_GRAPH")
fields.append("(b / BAD_GRAPH)")
else:
if not isinstance(v, str):
v = json.dumps(v)
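The two replacements above change the fallback from the bare string `BAD_GRAPH` to `(b / BAD_GRAPH)`, which is itself a minimal, well-formed AMR in PENMAN notation, so tools that later parse the serialized output do not choke on the placeholder. A small illustration using the `penman` package (an assumption made for illustration only; the repository may rely on its own AMR reader):

```python
# Illustration: "(b / BAD_GRAPH)" decodes as a one-node graph, while a bare
# string would not. Assumes `pip install penman`; not necessarily what xl-amr uses.
import penman

graph = penman.decode("(b / BAD_GRAPH)")
print(graph.triples)  # [('b', ':instance', 'BAD_GRAPH')]

try:
    penman.decode("BAD_GRAPH")
except Exception as exc:
    print(type(exc).__name__)  # a decode error in current penman releases
```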
Original file line number Diff line number Diff line change
@@ -53,7 +53,7 @@ def __init__(self, train_data=None, build_utils=False, util_dir=None, lang="en",
self.stemmer = None
else:
self.stemmer = nltk.stem.SnowballStemmer(code2lang[lang]).stem
-self.stopwords = [x.rstrip().lower() for x in open("data/babelnet/stopwords_{}.txt".format(lang)).readlines()]
+self.stopwords = [x.rstrip().lower() for x in open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(lang)).readlines()]
self.train_data = train_data
self.build_utils = build_utils
self.named_entity_count = 0
@@ -356,7 +356,7 @@ def _update_utils(self, entities, amr):

args = parser.parse_args()
if args.lang != "en":
enNM2langNM = json.load(open(os.path.join("data/babelnet/", "name_en_{}_bn_map.json".format(args.lang)), "r"))
enNM2langNM = json.load(open(os.path.join("data/cross-lingual-babelnet_mappings/", "name_en_{}_bn_map.json".format(args.lang)), "r"))
else:
enNM2langNM = None

Original file line number Diff line number Diff line change
@@ -359,17 +359,17 @@ def load_name_bn_map(json_file):

return lang_nm2en_nm

-def load_name_bn_wiki_map(file, reliable_sources_lang):
+def load_name_bn_wiki_map(file):
lang_nm2en_nm = dict()

with open(file, "r", encoding='utf-8') as infile:
for line in infile:
if line.startswith("#"): continue
fields=line.rstrip().split()
lang_nm = fields[2].replace("-","_")

if lang_nm in lang_nm2en_nm: continue
if lang_nm.islower(): continue
-if fields[-1] not in reliable_sources_lang: continue
if ":EN:" in fields[1]:
en_wiki = fields[1].split(":EN:")[-1]
elif "#n#1" in fields[1]:
@@ -407,11 +407,6 @@ def load_name_span_map(json_file, lang):
parser.add_argument('--exclude_ners', action="store_true", help="consider NER tags for entities not found in training.")
args = parser.parse_args()

-reliable_sources_lang = dict()
-reliable_sources_lang["it"] = set("WIKI OMWN MSTERM FRAMENET VERBNET OMWN_IT IWN OMWIKI".split())
-reliable_sources_lang["es"] = set("WIKI MSTERM FRAMENET VERBNET MRC_ES OMWIKI".split())
-reliable_sources_lang["de"] = set("WIKI OMWN MSTERM FRAMENET VERBNET OMWIKI WIKIDATA".split())
-reliable_sources_lang["zh"] = set("WIKI OMWN_ZH OMWN_CWN".split())

if args.lang=="en":
text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,
@@ -422,10 +417,10 @@

else:
text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,"text_anonymization_en-{}.json".format(args.lang)))
-lang_stopwords = set([x.rstrip() for x in open("data/babelnet/stopwords_{}.txt".format(args.lang))])
+lang_stopwords = set([x.rstrip() for x in open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(args.lang))])

lang2en_span=load_name_span_map("data/babelnet/name_span_en_{}_map_amr_bn.json".format(args.lang), args.lang)
lang2en_bn=load_name_bn_wiki_map("data/babelnet/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()), reliable_sources_lang[args.lang])
lang2en_span=load_name_span_map("data/cross-lingual-babelnet_mappings/name_span_en_{}_map_amr_bn.json".format(args.lang), args.lang)
lang2en_bn=load_name_bn_wiki_map("data/cross-lingual-babelnet_mappings/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()))

for amr_file in args.amr_file:
with open(amr_file + ".recategorize{}".format("_noner" if args.exclude_ners else ""), "w", encoding="utf-8") as f:
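The remaining hunks are path fixes: BabelNet-related resources move from `data/babelnet/` to `data/cross-lingual-babelnet_mappings/`, matching the `cross-lingual-babelnet_mappings.zip` archive added in this commit, and the `reliable_sources_lang` filtering is dropped from `load_name_bn_wiki_map`. A hedged sanity check that the files referenced by the updated paths exist after unzipping; the language list and the archive's exact contents are assumptions based on the rest of the commit:

```python
# Hedged sanity check for the path fixes above. File names are taken from the
# updated code; whether the archive ships all of them is an assumption.
import os

base = "data/cross-lingual-babelnet_mappings"
for lang in ["it", "es", "de", "zh"]:
    expected = [
        f"stopwords_{lang}.txt",
        f"name_en_{lang}_bn_map.json",
        f"name_span_en_{lang}_map_amr_bn.json",
        f"namedEntity_wiki_synsets.{lang.upper()}.tsv",
    ]
    for name in expected:
        path = os.path.join(base, name)
        print(("OK   " if os.path.exists(path) else "MISS ") + path)
```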
