diff --git a/.gitignore b/.gitignore
index e69de29..8094892 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,9 @@
+.idea/*
+*.pyc
+__pycache__/
+*egg-info/
+*.swp
+*.pt
+*.th
+models/*
+!.gitignore
\ No newline at end of file
diff --git a/README.md b/README.md
index 3596bbd..477dbb5 100644
--- a/README.md
+++ b/README.md
@@ -20,18 +20,25 @@ If you find either our code or our release datasets useful in your work, please
 }
 ```
 
-#### If you need our silver data and/or XL-AMR checkpoints, please contact us by email!
+#### If you need our silver data based on AMR 2.0 translations please contact us by email!
+
+Download XL-AMR best checkpoints* per language here: [CHECKPOINTS](https://drive.google.com/drive/folders/1_tu6EJET20pi5IG3T-807hDpBjtkPs7W?usp=sharing)
+
+<sub>*Please take care of the paths in the config file according to your file structure in case of exceptions, before sending an email or opening an issue.
+
 
 ## 1. Install
 
-Create a conda environment with **Python 3.6** and **PyTorch 1.5.0** and install the dependencies [requirements.txt](requirements.txt).
+Create a conda environment with **Python 3.6** and **PyTorch 1.5.0**, install the dependencies [requirements.txt](requirements.txt) and download the artifacts**.
 
 Via conda:
 
     conda create -n xlamr python=3.6
     source activate xlamr
     pip install -r requirements.txt
-    
+    bash scripts/download_artifacts.sh
+
+**Also please unzip all the zipped files you find inside the data folder before continuing with the other steps.
 
 ## 2. Gold Dataset
 1 - Download AMR 2.0 ([LDC2017T10](https://catalog.ldc.upenn.edu/LDC2017T10)) and AMR 2.0 - Four Translations ([LDC2020T07](https://catalog.ldc.upenn.edu/LDC2020T07)).
diff --git a/data/.gitignore b/data/.gitignore
deleted file mode 100644
index 94548af..0000000
--- a/data/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*
-*/
-!.gitignore
diff --git a/data/AMR.zip b/data/AMR.zip
new file mode 100644
index 0000000..6e71032
Binary files /dev/null and b/data/AMR.zip differ
diff --git a/data/cross-lingual-babelnet_mappings.zip b/data/cross-lingual-babelnet_mappings.zip
new file mode 100644
index 0000000..f851a22
Binary files /dev/null and b/data/cross-lingual-babelnet_mappings.zip differ
diff --git a/data/misc.zip b/data/misc.zip
new file mode 100644
index 0000000..50bbbf3
Binary files /dev/null and b/data/misc.zip differ
diff --git a/data/numberbatch.zip b/data/numberbatch.zip
new file mode 100644
index 0000000..ca48716
Binary files /dev/null and b/data/numberbatch.zip differ
diff --git a/xlamr_stog/data/data_misc/numberbatch_emb.py b/xlamr_stog/data/data_misc/numberbatch_emb.py
index b3ff371..e072d11 100644
--- a/xlamr_stog/data/data_misc/numberbatch_emb.py
+++ b/xlamr_stog/data/data_misc/numberbatch_emb.py
@@ -5,7 +5,7 @@
 outdir = "data/numberbatch"
 embeddings_path = os.path.join(outdir,'out_{}.txt')
 
-with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de.txt"),"w", encoding="utf-8") as outfile:
+with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de_zh.txt"),"w", encoding="utf-8") as outfile:
     for lang in ["en", "it", "es", "de","zh"]:
         for line in open(embeddings_path.format(lang),"r", encoding="utf-8"):
             fields = line.rstrip().split()
diff --git a/xlamr_stog/data/dataset_readers/amr_parsing/amr.py b/xlamr_stog/data/dataset_readers/amr_parsing/amr.py
index 1d4c223..716e2bb 100644
--- a/xlamr_stog/data/dataset_readers/amr_parsing/amr.py
+++ b/xlamr_stog/data/dataset_readers/amr_parsing/amr.py
@@ -123,14 +123,14 @@ def __repr__(self):
             try:
                 fields.append(str(v))
             except:
-                fields.append("BAD_GRAPH")
+                fields.append("(b / BAD_GRAPH)")
         elif k == 'graph_pred':
             if v==None: continue
             try:
                 fields.append("\n#****GOLD_GRAPH*******\n")
                 fields.append(str(v))
             except:
-                fields.append("BAD_GRAPH")
+                fields.append("(b / BAD_GRAPH)")
         else:
             if not isinstance(v, str):
                 v = json.dumps(v)
diff --git a/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/recategorizer_multilingual.py b/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/recategorizer_multilingual.py
index bf3c557..d5677c9 100644
--- a/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/recategorizer_multilingual.py
+++ b/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/recategorizer_multilingual.py
@@ -53,7 +53,7 @@ def __init__(self, train_data=None, build_utils=False, util_dir=None, lang="en",
             self.stemmer = None
         else:
             self.stemmer = nltk.stem.SnowballStemmer(code2lang[lang]).stem
-        self.stopwords = [x.rstrip().lower() for x in open("data/babelnet/stopwords_{}.txt".format(lang)).readlines()]
+        self.stopwords = [x.rstrip().lower() for x in open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(lang)).readlines()]
         self.train_data = train_data
         self.build_utils = build_utils
         self.named_entity_count = 0
@@ -356,7 +356,7 @@ def _update_utils(self, entities, amr):
     args = parser.parse_args()
 
     if args.lang != "en":
-        enNM2langNM = json.load(open(os.path.join("data/babelnet/", "name_en_{}_bn_map.json".format(args.lang)), "r"))
+        enNM2langNM = json.load(open(os.path.join("data/cross-lingual-babelnet_mappings/", "name_en_{}_bn_map.json".format(args.lang)), "r"))
     else:
         enNM2langNM = None
diff --git a/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/text_anonymizor.py b/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/text_anonymizor.py
index 8806569..cf553f3 100644
--- a/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/text_anonymizor.py
+++ b/xlamr_stog/data/dataset_readers/amr_parsing/preprocess/text_anonymizor.py
@@ -359,17 +359,17 @@ def load_name_bn_map(json_file):
     return lang_nm2en_nm
 
 
-def load_name_bn_wiki_map(file, reliable_sources_lang):
+def load_name_bn_wiki_map(file):
     lang_nm2en_nm = dict()
     with open(file, "r", encoding='utf-8') as infile:
         for line in infile:
+            if line.startswith("#"): continue
             fields=line.rstrip().split()
             lang_nm = fields[2].replace("-","_")
             if lang_nm in lang_nm2en_nm: continue
             if lang_nm.islower(): continue
-            if fields[-1] not in reliable_sources_lang: continue
             if ":EN:" in fields[1]:
                 en_wiki = fields[1].split(":EN:")[-1]
             elif "#n#1" in fields[1]:
@@ -407,11 +407,6 @@ def load_name_span_map(json_file, lang):
     parser.add_argument('--exclude_ners', action="store_true", help="consider NER tags for entities not found in training.")
     args = parser.parse_args()
 
-    reliable_sources_lang = dict()
-    reliable_sources_lang["it"] = set("WIKI OMWN MSTERM FRAMENET VERBNET OMWN_IT IWN OMWIKI".split())
-    reliable_sources_lang["es"] = set("WIKI MSTERM FRAMENET VERBNET MRC_ES OMWIKI".split())
-    reliable_sources_lang["de"] = set("WIKI OMWN MSTERM FRAMENET VERBNET OMWIKI WIKIDATA".split())
-    reliable_sources_lang["zh"] = set("WIKI OMWN_ZH OMWN_CWN".split())
 
     if args.lang=="en":
         text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,
@@ -422,10 +417,10 @@ def load_name_span_map(json_file, lang):
     else:
         text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,"text_anonymization_en-{}.json".format(args.lang)))
 
-    lang_stopwords = set([x.rstrip() for x in open("data/babelnet/stopwords_{}.txt".format(args.lang))])
+    lang_stopwords = set([x.rstrip() for x in open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(args.lang))])
 
-    lang2en_span=load_name_span_map("data/babelnet/name_span_en_{}_map_amr_bn.json".format(args.lang), args.lang)
-    lang2en_bn=load_name_bn_wiki_map("data/babelnet/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()), reliable_sources_lang[args.lang])
+    lang2en_span=load_name_span_map("data/cross-lingual-babelnet_mappings/name_span_en_{}_map_amr_bn.json".format(args.lang), args.lang)
+    lang2en_bn=load_name_bn_wiki_map("data/cross-lingual-babelnet_mappings/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()))
 
     for amr_file in args.amr_file:
        with open(amr_file + ".recategorize{}".format("_noner" if args.exclude_ners else ""), "w", encoding="utf-8") as f: