DATA & CHECKPOINT release and minor path fixes
rexhinab authored and committed Apr 14, 2021
1 parent e65d00d commit b406b53
Showing 11 changed files with 29 additions and 21 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
+.idea/*
+*.pyc
+__pycache__/
+*egg-info/
+*.swp
+*.pt
+*.th
+models/*
+!.gitignore
13 changes: 10 additions & 3 deletions README.md
@@ -20,18 +20,25 @@ If you find either our code or our released datasets useful in your work, please
}
```

-#### If you need our silver data and/or XL-AMR checkpoints, please contact us by email!
+#### If you need our silver data based on the AMR 2.0 translations, please contact us by email!

+Download the best XL-AMR checkpoints* per language here: [CHECKPOINTS](https://drive.google.com/drive/folders/1_tu6EJET20pi5IG3T-807hDpBjtkPs7W?usp=sharing)

+<sub>*In case of exceptions, please first check that the paths in the config file match your file structure before sending an email or opening an issue.</sub>


## 1. Install

-Create a conda environment with **Python 3.6** and **PyTorch 1.5.0** and install the dependencies [requirements.txt](requirements.txt).
+Create a conda environment with **Python 3.6** and **PyTorch 1.5.0**, install the dependencies from [requirements.txt](requirements.txt), and download the artifacts**.

Via conda:

conda create -n xlamr python=3.6
source activate xlamr
pip install -r requirements.txt

+bash scripts/download_artifacts.sh

+**Please also unzip all the zipped files inside the `data` folder before continuing with the remaining steps.

## 2. Gold Dataset
1 - Download AMR 2.0 ([LDC2017T10](https://catalog.ldc.upenn.edu/LDC2017T10)) and AMR 2.0 - Four Translations ([LDC2020T07](https://catalog.ldc.upenn.edu/LDC2020T07)).
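The install instructions above end with an unzip step for the archives this commit adds under `data/` (listed as binary files below). A minimal sketch of that step, assuming each archive is simply extracted in place under `data/`; the layout inside each zip is not shown here, so adjust the target directory if an archive is organized differently:

```python
# Hedged sketch of the "unzip everything under data/" step from the README.
# Assumption: each archive extracts in place next to the zip file.
import zipfile
from pathlib import Path

data_dir = Path("data")
for archive in sorted(data_dir.glob("*.zip")):
    print(f"Extracting {archive} ...")
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(data_dir)
```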
3 changes: 0 additions & 3 deletions data/.gitignore

This file was deleted.

Binary file added data/AMR.zip
Binary file added data/cross-lingual-babelnet_mappings.zip
Binary file added data/misc.zip
Binary file added data/numberbatch.zip
2 changes: 1 addition & 1 deletion xlamr_stog/data/data_misc/numberbatch_emb.py
@@ -5,7 +5,7 @@

outdir = "data/numberbatch"
embeddings_path = os.path.join(outdir,'out_{}.txt')
-with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de.txt"),"w", encoding="utf-8") as outfile:
+with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de_zh.txt"),"w", encoding="utf-8") as outfile:
for lang in ["en", "it", "es", "de","zh"]:
for line in open(embeddings_path.format(lang),"r", encoding="utf-8"):
fields = line.rstrip().split()
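The hunk above only renames the merged output file to include Chinese (`_zh`); the rest of the loop is truncated. A hedged sketch of the concatenation the script appears to perform, merging the per-language `out_{lang}.txt` Numberbatch files into one multilingual file; the header-skipping heuristic and any further filtering are assumptions, not taken from the hunk:

```python
import os

outdir = "data/numberbatch"
embeddings_path = os.path.join(outdir, "out_{}.txt")

# Merge the per-language embedding files into a single multilingual file,
# now including Chinese ("zh") alongside en/it/es/de.
with open(os.path.join(outdir, "numberbatch-19.08.en_it_es_de_zh.txt"),
          "w", encoding="utf-8") as outfile:
    for lang in ["en", "it", "es", "de", "zh"]:
        with open(embeddings_path.format(lang), "r", encoding="utf-8") as infile:
            for line in infile:
                fields = line.rstrip().split()
                # Assumption: skip short header/count lines such as "<rows> <dims>".
                if len(fields) < 3:
                    continue
                outfile.write(" ".join(fields) + "\n")
```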
4 changes: 2 additions & 2 deletions xlamr_stog/data/dataset_readers/amr_parsing/amr.py
@@ -123,14 +123,14 @@ def __repr__(self):
try:
fields.append(str(v))
except:
fields.append("BAD_GRAPH")
fields.append("(b / BAD_GRAPH)")
elif k == 'graph_pred':
if v==None: continue
try:
fields.append("\n#****GOLD_GRAPH*******\n")
fields.append(str(v))
except:
fields.append("BAD_GRAPH")
fields.append("(b / BAD_GRAPH)")
else:
if not isinstance(v, str):
v = json.dumps(v)
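The two replacements above change the fallback from the bare string `BAD_GRAPH` to `(b / BAD_GRAPH)`, which is itself a minimal, well-formed AMR in PENMAN notation, so tools that later parse the serialized output do not choke on the placeholder. A small illustration using the `penman` package (an assumption made for illustration only; the repository may rely on its own AMR reader):

```python
# Illustration: "(b / BAD_GRAPH)" decodes as a one-node graph, while a bare
# string would not. Assumes `pip install penman`; not necessarily what xl-amr uses.
import penman

graph = penman.decode("(b / BAD_GRAPH)")
print(graph.triples)  # [('b', ':instance', 'BAD_GRAPH')]

try:
    penman.decode("BAD_GRAPH")
except Exception as exc:
    print(type(exc).__name__)  # a decode error in current penman releases
```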
Original file line number Diff line number Diff line change
@@ -53,7 +53,7 @@ def __init__(self, train_data=None, build_utils=False, util_dir=None, lang="en",
self.stemmer = None
else:
self.stemmer = nltk.stem.SnowballStemmer(code2lang[lang]).stem
-self.stopwords = [x.rstrip().lower() for x in open("data/babelnet/stopwords_{}.txt".format(lang)).readlines()]
+self.stopwords = [x.rstrip().lower() for x in open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(lang)).readlines()]
self.train_data = train_data
self.build_utils = build_utils
self.named_entity_count = 0
@@ -356,7 +356,7 @@ def _update_utils(self, entities, amr):

args = parser.parse_args()
if args.lang != "en":
enNM2langNM = json.load(open(os.path.join("data/babelnet/", "name_en_{}_bn_map.json".format(args.lang)), "r"))
enNM2langNM = json.load(open(os.path.join("data/cross-lingual-babelnet_mappings/", "name_en_{}_bn_map.json".format(args.lang)), "r"))
else:
enNM2langNM = None

Original file line number Diff line number Diff line change
@@ -359,17 +359,17 @@ def load_name_bn_map(json_file):

return lang_nm2en_nm

-def load_name_bn_wiki_map(file, reliable_sources_lang):
+def load_name_bn_wiki_map(file):
lang_nm2en_nm = dict()

with open(file, "r", encoding='utf-8') as infile:
for line in infile:
if line.startswith("#"): continue
fields=line.rstrip().split()
lang_nm = fields[2].replace("-","_")

if lang_nm in lang_nm2en_nm: continue
if lang_nm.islower(): continue
-if fields[-1] not in reliable_sources_lang: continue
if ":EN:" in fields[1]:
en_wiki = fields[1].split(":EN:")[-1]
elif "#n#1" in fields[1]:
@@ -407,11 +407,6 @@ def load_name_span_map(json_file, lang):
parser.add_argument('--exclude_ners', action="store_true", help="consider NER tags for entities not found in training.")
args = parser.parse_args()

-reliable_sources_lang = dict()
-reliable_sources_lang["it"] = set("WIKI OMWN MSTERM FRAMENET VERBNET OMWN_IT IWN OMWIKI".split())
-reliable_sources_lang["es"] = set("WIKI MSTERM FRAMENET VERBNET MRC_ES OMWIKI".split())
-reliable_sources_lang["de"] = set("WIKI OMWN MSTERM FRAMENET VERBNET OMWIKI WIKIDATA".split())
-reliable_sources_lang["zh"] = set("WIKI OMWN_ZH OMWN_CWN".split())

if args.lang=="en":
text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,
@@ -422,10 +417,10 @@

else:
text_anonymizor = TextAnonymizor.from_json(os.path.join(args.util_dir,"text_anonymization_en-{}.json".format(args.lang)))
-lang_stopwords = set([x.rstrip() for x in open("data/babelnet/stopwords_{}.txt".format(args.lang))])
+lang_stopwords = set([x.rstrip() for x in open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(args.lang))])

lang2en_span=load_name_span_map("data/babelnet/name_span_en_{}_map_amr_bn.json".format(args.lang), args.lang)
lang2en_bn=load_name_bn_wiki_map("data/babelnet/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()), reliable_sources_lang[args.lang])
lang2en_span=load_name_span_map("data/cross-lingual-babelnet_mappings/name_span_en_{}_map_amr_bn.json".format(args.lang), args.lang)
lang2en_bn=load_name_bn_wiki_map("data/cross-lingual-babelnet_mappings/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()))

for amr_file in args.amr_file:
with open(amr_file + ".recategorize{}".format("_noner" if args.exclude_ners else ""), "w", encoding="utf-8") as f:
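The remaining hunks are path fixes: BabelNet-related resources move from `data/babelnet/` to `data/cross-lingual-babelnet_mappings/`, matching the `cross-lingual-babelnet_mappings.zip` archive added in this commit, and the `reliable_sources_lang` filtering is dropped from `load_name_bn_wiki_map`. A hedged sanity check that the files referenced by the updated paths exist after unzipping; the language list and the archive's exact contents are assumptions based on the rest of the commit:

```python
# Hedged sanity check for the path fixes above. File names are taken from the
# updated code; whether the archive ships all of them is an assumption.
import os

base = "data/cross-lingual-babelnet_mappings"
for lang in ["it", "es", "de", "zh"]:
    expected = [
        f"stopwords_{lang}.txt",
        f"name_en_{lang}_bn_map.json",
        f"name_span_en_{lang}_map_amr_bn.json",
        f"namedEntity_wiki_synsets.{lang.upper()}.tsv",
    ]
    for name in expected:
        path = os.path.join(base, name)
        print(("OK   " if os.path.exists(path) else "MISS ") + path)
```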
