update readme

ekinakyurek · Jun 7, 2021 · e7a44e1 · e7a44e1
commit e7a44e1
Show file tree

Hide file tree

Showing 92 changed files with 123,514 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,142 @@
+notes/*.aux
+notes/*.log
+notes/*.out
+notes/*.pdf
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "COGS"]
+	path = COGS
+	url = https://github.com/najoungkim/COGS.git
+[submodule "SCAN"]
+	path = SCAN
+	url = https://github.com/brendenlake/SCAN.git
diff --git a/COGS b/COGS
diff --git a/COLOR/alignments/forward.align.o.json b/COLOR/alignments/forward.align.o.json
@@ -0,0 +1 @@
+{"dax": {"RED": 9}, "lug": {"BLUE": 14}, "wif": {"GREEN": 12}, "zup": {"YELLOW": 1}}
diff --git a/COLOR/alignments/goodman.json b/COLOR/alignments/goodman.json
@@ -0,0 +1,7 @@
+{"dax": {"RED": 1995, "BLUE": 2, "GREEN": 10}, 
+"wif": {"GREEN": 1995, "BLUE": 116, "RED": 16}, 
+"lug": {"BLUE": 1995, "GREEN": 39, "RED": 5}, 
+"kiki": {"GREEN": 1030, "BLUE": 844, "RED": 17}, 
+"blicket": {"GREEN": 1738, "RED": 74, "BLUE": 23}, 
+"zup": {"YELLOW": 568}, 
+"fep": {"BLUE": 543, "RED": 46}}
diff --git a/COLOR/alignments/intersect.align.o.json b/COLOR/alignments/intersect.align.o.json
@@ -0,0 +1 @@
+{"dax": {"RED": 6}, "lug": {"BLUE": 9}, "wif": {"GREEN": 8}, "zup": {"YELLOW": 1}}
diff --git a/COLOR/alignments/pmi.align.json b/COLOR/alignments/pmi.align.json
@@ -0,0 +1 @@
+{"dax": {"RED": 58.53658318519592, "BLUE": 19.51219290494919, "GREEN": 21.95121943950653, "YELLOW": 5.853659601484651e-06}, "lug": {"RED": 22.22222238779068, "BLUE": 44.44444477558136, "GREEN": 33.33333432674408, "YELLOW": 4.444445522722162e-06}, "wif": {"RED": 23.076924681663513, "BLUE": 30.769231915473938, "GREEN": 46.153849363327026, "YELLOW": 4.615386117734488e-06}, "zup": {"RED": 1.666666982202969e-06, "BLUE": 1.111110314866437e-06, "GREEN": 1.2500009027860415e-06, "YELLOW": 100.0}, "fep": {"RED": 22.22221940755844, "BLUE": 44.44443881511688, "GREEN": 33.3333283662796, "YELLOW": 1.3333330173281865e-05}, "blicket": {"RED": 37.5, "BLUE": 24.99999850988388, "GREEN": 37.5, "YELLOW": 7.49999102822585e-06}, "kiki": {"RED": 27.906975150108337, "BLUE": 37.209299206733704, "GREEN": 34.88371968269348, "YELLOW": 5.581395967624303e-06}}
diff --git a/COLOR/alignments/simple.align.v3.json b/COLOR/alignments/simple.align.v3.json
@@ -0,0 +1 @@
+{"dax": {"RED": 6}, "lug": {"BLUE": 9}, "wif": {"GREEN": 8}, "zup": {"YELLOW": 1}}
diff --git a/COLOR/color.align.txt b/COLOR/color.align.txt
@@ -0,0 +1,14 @@
+dax ||| RED
+lug ||| BLUE
+wif ||| GREEN
+zup ||| YELLOW
+lug fep ||| BLUE BLUE BLUE
+dax fep ||| RED RED RED
+lug blicket wif ||| BLUE GREEN BLUE
+wif blicket dax ||| GREEN RED GREEN
+lug kiki wif ||| GREEN BLUE
+dax kiki lug ||| BLUE RED
+lug fep kiki wif ||| GREEN BLUE BLUE BLUE
+wif kiki dax blicket lug ||| RED BLUE RED GREEN
+lug kiki wif fep ||| GREEN GREEN GREEN BLUE
+wif blicket dax kiki lug ||| BLUE GREEN RED GREEN
diff --git a/COLOR/test.tsv b/COLOR/test.tsv
@@ -0,0 +1,10 @@
+zup fep 	  YELLOW YELLOW YELLOW
+zup blicket lug 	  YELLOW BLUE YELLOW
+dax blicket zup 	  RED YELLOW RED
+zup kiki dax 	  RED YELLOW
+wif kiki zup 	  YELLOW GREEN
+zup fep kiki lug 	  BLUE YELLOW YELLOW YELLOW
+wif kiki zup fep 	  YELLOW YELLOW YELLOW GREEN
+lug kiki wif blicket zup 	  GREEN YELLOW GREEN BLUE
+zup blicket wif kiki dax fep 	  RED RED RED YELLOW GREEN YELLOW
+zup blicket zup kiki zup fep 	  YELLOW YELLOW YELLOW YELLOW YELLOW YELLOW
diff --git a/COLOR/train.tsv b/COLOR/train.tsv
@@ -0,0 +1,14 @@
+dax 	  RED
+lug 	  BLUE
+wif 	  GREEN
+zup 	  YELLOW
+lug fep 	  BLUE BLUE BLUE
+dax fep 	  RED RED RED
+lug blicket wif 	  BLUE GREEN BLUE
+wif blicket dax 	  GREEN RED GREEN
+lug kiki wif 	  GREEN BLUE
+dax kiki lug 	  BLUE RED
+lug fep kiki wif 	  GREEN BLUE BLUE BLUE
+wif kiki dax blicket lug 	  RED BLUE RED GREEN
+lug kiki wif fep 	  GREEN GREEN GREEN BLUE
+wif blicket dax kiki lug 	  BLUE GREEN RED GREEN
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Ekin Akyürek
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,105 @@
+# Lexicon Learning for Few-Shot Neural Sequence Modeling
+Paper: [Lexicon Learning for Few-Shot Neural Sequence Modeling](https://drive.google.com/file/d/1jDCXL4MT_rLN2X2QNHFae90r17w3HM3g/view?usp=sharing)    
+Ekin Akyürek, Jacob Andreas ACL 2021
+
+![Lexicon Model](lexicon_model.png "Lexicon Model")
+
+
+## Dependencies
+- **OS**: Linux or macOS
+- **Language**: Python
+- **Hardware**:  NVIDIA GPU (CUDA and cuDNN)
+- **Libraries**: PyTorch, numpy, nltk
+- **Optional**:
+  - Jupyter Notebook (Used for analysis of results.)
+
+
+## Requirements
+
+You can use the provided [environment.yml](./environment.yml) to set up the required libraries with `conda`.
+
+### Info:
+
+Exact versions of the important libraries I used for this project.   
+
+1. python=3.7.3
+2. pytorch=1.2.0 (cuda10.0.130_cudnn7.6.2_0)
+3. numpy=1.19.1
+4. nltk=3.5
+
+Note that this codebase is meant for reproducing the results in the paper, so you may need the specific dependency versions listed above. However, it should work with newer versions as well.
+
+## Setup
+
+You can set up this repo by running the following commands in a shell:
+
+```SHELL
+
+git clone --recurse-submodules https://github.com/ekinakyurek/lexical.git
+cd lexical
+conda env create --file environment.yml # creates conda env with required packages
+```
+
+## Data
+
+The [COGS](./COGS) and [SCAN](./SCAN) datasets are provided as submodules.   
+The [TRANSLATE](./TRANSLATE) and [COLOR](./COLOR) datasets are provided as subfolders.
+
+
+> 📋 See individual license files for each dataset under their folders.
+
+## Training
+
+To verify the results presented in the paper, you may run the scripts to train models and see the evaluations.
+
+**Lexicon Learning (optional)**:
+
+We provide the required lexicon files in the repo. For those who are interested in this part:
+
+You need formatted training files with the following structure ([ref](https://github.com/clab/fast_align)):
+
+> Each line is a source language sentence and its target language translation, separated by a triple pipe symbol with leading and trailing white space (|||).
+
+Then you can extract all the lexicons by running the script:
+
+```SHELL
+sh extract_alignments.sh
+```
+
+**Seq2Seq Training**:
+
+All experiment scripts are found at [exps/](exps/)
+
+For example,
+
+- To run the *simple* model on COGS dataset you can use:
+```SHELL
+cd exps/COGS
+sh simple.sh $i #  $i is the seed of the experiment. You can use `sbatch simple.sh` to run all experiments in parallel on Slurm
+```
+
+- To run the *IBM2* model on COGS dataset you can use:
+```SHELL
+cd exps/COGS
+sh fast.sh $i
+```
+
+The logs can be found in the created subfolders.
+
+> 📋 Note that the experiments are tested on NVIDIA 32GB V100 Volta GPUs. For some models GPU requirements might be high.
+
+> 📋 The jump (SCAN) and Color experiments are very sensitive to seeds, so any change in the code might change the results slightly.
+
+## Evaluation
+
+After running an experiment, the evaluation results can be found under the created subfolders. We provide convenience scripts in each experiment folder that collate the results in the shell:
+
+```SHELL
+sh collect.sh   # (WIP: some of them need to be updated slightly)
+```
+
+After running all experiments, one can refer to `analyze.ipynb` Jupyter Notebook to obtain the figures and tables provided in the paper.
+
+## Troubleshooting
+
+TODO
diff --git a/SCAN b/SCAN
diff --git a/TRANSLATE/README.md b/TRANSLATE/README.md
@@ -0,0 +1,19 @@
+## Attribution / License
+
+- We use the [data](./cmn.txt) obtained from [ManyThings.org](https://www.manythings.org/bilingual/), which collected this data from the tatoeba.org corpus.   
+
+- The data is licensed under the [Creative Commons - Attribution 2.0](https://creativecommons.org/licenses/by/2.0/fr/deed.en_GB) France license. ([terms of use page on tatoeba.org](http://tatoeba.org/eng/terms_of_use))
+
+## Copyright
+
+- Though the source data is available under the CC BY license, the downloaded [datafile](./cmn.txt) in this repository has been edited by [ManyThings.org](https://www.manythings.org/bilingual/) and is copyrighted.
+
+## Tokenized Files
+
+You can find the tokenized files (`*.tsv`) produced from the original data in this folder. We use these files in our experiments.
+
+
+
+
+
+
diff --git a/TRANSLATE/alignments/forward.align.o.json b/TRANSLATE/alignments/forward.align.o.json
diff --git a/TRANSLATE/alignments/intersect.align.o.json b/TRANSLATE/alignments/intersect.align.o.json
diff --git a/TRANSLATE/alignments/simple.align.v3.json b/TRANSLATE/alignments/simple.align.v3.json
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"dax": {"RED": 9}, "lug": {"BLUE": 14}, "wif": {"GREEN": 12}, "zup": {"YELLOW": 1}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"dax": {"RED": 6}, "lug": {"BLUE": 9}, "wif": {"GREEN": 8}, "zup": {"YELLOW": 1}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"dax": {"RED": 58.53658318519592, "BLUE": 19.51219290494919, "GREEN": 21.95121943950653, "YELLOW": 5.853659601484651e-06}, "lug": {"RED": 22.22222238779068, "BLUE": 44.44444477558136, "GREEN": 33.33333432674408, "YELLOW": 4.444445522722162e-06}, "wif": {"RED": 23.076924681663513, "BLUE": 30.769231915473938, "GREEN": 46.153849363327026, "YELLOW": 4.615386117734488e-06}, "zup": {"RED": 1.666666982202969e-06, "BLUE": 1.111110314866437e-06, "GREEN": 1.2500009027860415e-06, "YELLOW": 100.0}, "fep": {"RED": 22.22221940755844, "BLUE": 44.44443881511688, "GREEN": 33.3333283662796, "YELLOW": 1.3333330173281865e-05}, "blicket": {"RED": 37.5, "BLUE": 24.99999850988388, "GREEN": 37.5, "YELLOW": 7.49999102822585e-06}, "kiki": {"RED": 27.906975150108337, "BLUE": 37.209299206733704, "GREEN": 34.88371968269348, "YELLOW": 5.581395967624303e-06}}