diff --git a/Makefile b/Makefile index ac6e9fa604..7d8ca4d74f 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ install: install-for-tests: @echo "--- ๐Ÿš€ Installing project dependencies for test ---" @echo "This ensures that the project is not installed in editable mode" - pip install ".[dev,speedtask]" + pip install ".[dev,speedtask,bm25s,pylate]" lint: @echo "--- ๐Ÿงน Running linters ---" @@ -45,5 +45,5 @@ serve-docs: model-load-test: @echo "--- ๐Ÿš€ Running model load test ---" pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]" - python scripts/extract_model_names.py $(BASE_BRANCH) + python scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt \ No newline at end of file diff --git a/docs/mmteb/points_table.md b/docs/mmteb/points_table.md index 37f7c77258..cd166890d7 100644 --- a/docs/mmteb/points_table.md +++ b/docs/mmteb/points_table.md @@ -2,103 +2,103 @@ _Note_: this table is **autogenerated** and should not be edited. It is intended to get an overview of contributions. 
- | GitHub | Paper writing | New dataset | Review PR | Bug fixes | Coordination | Dataset annotations | New task | Running Models | Total | -|:------------------|----------------:|--------------:|------------:|------------:|---------------:|----------------------:|-----------:|-----------------:|--------:| -| KennethEnevoldsen | 0 | 68 | 326 | 87 | 81 | 35 | 0 | 0 | 597 | -| isaac-chung | 12 | 120 | 194 | 50 | 54 | 1 | 2 | 0 | 433 | -| imenelydiaker | 0 | 120 | 144 | 24 | 70 | 0 | 0 | 0 | 358 | -| awinml | 0 | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 302 | -| x-tabdeveloping | 0 | 144 | 32 | 10 | 41 | 0 | 12 | 0 | 239 | -| davidstap | 0 | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | -| jaygala24 | 0 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | -| wissam-sib | 0 | 134 | 6 | 4 | 0 | 0 | 0 | 0 | 144 | -| Muennighoff | 0 | 0 | 48 | 0 | 70 | 0 | 0 | 24 | 142 | -| orionw | 0 | 0 | 20 | 20 | 75 | 0 | 10 | 0 | 125 | -| dokato | 0 | 94 | 6 | 12 | 0 | 0 | 0 | 0 | 112 | -| gentaiscool | 0 | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | -| jupyterjazz | 0 | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | -| SaitejaUtpala | 0 | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | -| vaibhavad | 0 | 6 | 4 | 8 | 75 | 0 | 0 | 0 | 93 | -| schmarion | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| MathieuCiancone | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| GabrielSequeira | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| digantamisra98 | 0 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | -| shreeya-dhakal | 0 | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 62 | -| Rysias | 0 | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | -| Samoed | 0 | 18 | 2 | 22 | 0 | 0 | 0 | 9 | 51 | -| sivareddyg | 0 | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 50 | -| gowitheflow-1998 | 0 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | -| asparius | 0 | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 48 | -| Akash190104 | 0 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | -| MartinBernstorff | 0 | 2 | 8 | 13 | 20 | 0 | 0 | 0 | 43 | -| akshita-sukhlecha | 0 | 36 | 0 | 4 | 0 | 0 | 0 | 0 | 40 | -| staoxiao | 0 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | -| bp-high | 0 | 36 | 0 | 0 | 0 | 0 | 0 | 
0 | 36 | -| rafalposwiata | 0 | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| KranthiGV | 0 | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 34 | -| loicmagne | 0 | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 28 | -| ShawonAshraf | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| bjoernpl | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| jphme | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| rasdani | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| violenil | 0 | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| mariyahendriksen | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| dwzhu-pku | 0 | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| hgissbkh | 3 | 0 | 2 | 13 | 0 | 0 | 5 | 0 | 23 | -| taeminlee | 0 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| kwojtasi | 0 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| jankounchained | 0 | 14 | 0 | 8 | 0 | 0 | 0 | 0 | 22 | -| tomaarsen | 0 | 0 | 2 | 0 | 20 | 0 | 0 | 0 | 22 | -| crystina-z | 0 | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| mrshu | 0 | 16 | 4 | 0 | 0 | 1 | 0 | 0 | 21 | -| john-b-yang | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| rbroc | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| mmhamdy | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| ManuelFay | 0 | 2 | 0 | 13 | 0 | 0 | 5 | 0 | 20 | -| AlexeyVatolin | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 20 | -| Andrian0s | 0 | 14 | 4 | 2 | 0 | 0 | 0 | 0 | 20 | -| thakur-nandan | 0 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| manandey | 0 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| PranjalChitale | 0 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| dipam7 | 0 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 16 | -| sted97 | 0 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| Sakshamrzt | 0 | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 16 | -| taidnguyen | 0 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| artemsnegirev | 0 | 12 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | -| slvnwhrl | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| anpalmak2003 | 0 | 9 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | -| Art3mis07 | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| guenthermi | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| jordiclive | 0 | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 12 | -| xhluca | 0 | 6 | 2 | 4 | 0 | 0 | 0 | 0 | 12 | -| henilp105 | 
0 | 0 | 0 | 2 | 0 | 9 | 0 | 0 | 11 | -| MariyaTikhonova | 0 | 7 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | -| ab1992ao | 0 | 8 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | -| tmp_handle | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | -| swj0419 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Ruqyai | 0 | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 10 | -| ZhengLiu101 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Alenush | 0 | 6 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | -| ABorghini | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| simon-clematide | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sarahooker | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| guangyusong | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| HLasse | 0 | 0 | 0 | 5 | 0 | 5 | 0 | 0 | 10 | -| cassanof | 0 | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 10 | -| hongjin-su | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xiamengzhou | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xu3kev | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| howard-yen | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| malteos | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| ljvmiranda921 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| marcobellagente93 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| izhx | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| MexicanLemonade | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| antoniolanza1996 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | -| achibb | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| NouamaneTazi | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| PhilipMay | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| cslizc | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bakrianoo | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| hanhainebula | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| monikernemo | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | \ No newline at end of file + | GitHub | New dataset | Review PR | Bug fixes | Coordination | Paper writing | Dataset annotations | Running Models | New task | Total | +|:------------------|--------------:|------------:|------------:|---------------:|----------------:|----------------------:|-----------------:|-----------:|--------:| +| KennethEnevoldsen | 68 | 326 | 87 
| 81 | 0 | 35 | 0 | 0 | 597 | +| isaac-chung | 120 | 194 | 50 | 54 | 12 | 1 | 0 | 2 | 433 | +| imenelydiaker | 120 | 144 | 24 | 70 | 0 | 0 | 0 | 0 | 358 | +| awinml | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 302 | +| x-tabdeveloping | 144 | 32 | 10 | 41 | 0 | 0 | 0 | 12 | 239 | +| davidstap | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | +| jaygala24 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | +| wissam-sib | 134 | 6 | 4 | 0 | 0 | 0 | 0 | 0 | 144 | +| Muennighoff | 0 | 48 | 0 | 70 | 0 | 0 | 24 | 0 | 142 | +| orionw | 0 | 20 | 20 | 75 | 0 | 0 | 0 | 10 | 125 | +| dokato | 94 | 6 | 12 | 0 | 0 | 0 | 0 | 0 | 112 | +| gentaiscool | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | +| jupyterjazz | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | +| SaitejaUtpala | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | +| vaibhavad | 6 | 4 | 8 | 75 | 0 | 0 | 0 | 0 | 93 | +| schmarion | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| MathieuCiancone | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| GabrielSequeira | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| digantamisra98 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | +| shreeya-dhakal | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 62 | +| Rysias | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | +| Samoed | 18 | 2 | 22 | 0 | 0 | 0 | 9 | 0 | 51 | +| sivareddyg | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 0 | 50 | +| gowitheflow-1998 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | +| asparius | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 48 | +| Akash190104 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | +| MartinBernstorff | 2 | 8 | 13 | 20 | 0 | 0 | 0 | 0 | 43 | +| akshita-sukhlecha | 36 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 40 | +| staoxiao | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | +| bp-high | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| rafalposwiata | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| KranthiGV | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | +| loicmagne | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 28 | +| ShawonAshraf | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| bjoernpl | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| jphme | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| rasdani | 28 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| violenil | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| mariyahendriksen | 0 | 0 | 0 | 0 | 24 | 0 | 0 | 0 | 24 | +| dwzhu-pku | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | +| hgissbkh | 0 | 2 | 13 | 0 | 3 | 0 | 0 | 5 | 23 | +| taeminlee | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| kwojtasi | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| jankounchained | 14 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 22 | +| tomaarsen | 0 | 2 | 0 | 20 | 0 | 0 | 0 | 0 | 22 | +| crystina-z | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| mrshu | 16 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 21 | +| john-b-yang | 0 | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 20 | +| rbroc | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| mmhamdy | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| ManuelFay | 2 | 0 | 13 | 0 | 0 | 0 | 0 | 5 | 20 | +| AlexeyVatolin | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 20 | +| Andrian0s | 14 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 20 | +| thakur-nandan | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| manandey | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| PranjalChitale | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| dipam7 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| sted97 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| Sakshamrzt | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| taidnguyen | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| artemsnegirev | 12 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | +| slvnwhrl | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| anpalmak2003 | 9 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | +| Art3mis07 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| guenthermi | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| jordiclive | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 12 | +| xhluca | 6 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 12 | +| henilp105 | 0 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | 11 | +| MariyaTikhonova | 7 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | +| ab1992ao | 8 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | +| tmp_handle | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 0 | 10 | +| swj0419 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| Ruqyai | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ZhengLiu101 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 10 | +| Alenush | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | +| ABorghini | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| simon-clematide | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sarahooker | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | +| guangyusong | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| HLasse | 0 | 0 | 5 | 0 | 0 | 5 | 0 | 0 | 10 | +| cassanof | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10 | +| hongjin-su | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xiamengzhou | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xu3kev | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| howard-yen | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| malteos | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ljvmiranda921 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| marcobellagente93 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| izhx | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| MexicanLemonade | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| antoniolanza1996 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | +| achibb | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| NouamaneTazi | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| PhilipMay | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cslizc | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bakrianoo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| hanhainebula | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| monikernemo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | \ No newline at end of file diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 399157757d..678b9a6086 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -143,7 +143,7 @@ def evaluate( hf_subsets = copy(self.hf_subsets) if subsets_to_run is not None: # allow overwrites of pre-filtering - hf_subsets = subsets_to_run + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] for hf_subset in hf_subsets: logger.info( @@ -278,10 +278,6 @@ def _calculate_metrics_from_split( ) -> DescriptiveStatistics: raise NotImplementedError - @property - def metadata_dict(self) -> dict[str, Any]: - return dict(self.metadata) - @property def languages(self) -> list[str]: """Returns the 
languages of the task""" diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 268fc748cf..50e2b45cc5 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -71,7 +71,7 @@ def load_results( MTEB_EN = Benchmark( - name="MTEB(eng, beta)", + name="MTEB(eng)", tasks=MTEBTasks( get_tasks( tasks=[ @@ -128,7 +128,13 @@ def load_results( get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]), ), ), - description="English benchmarks from MTEB", + description="""The new English Massive Text Embedding Benchmark. +This benchmark was created to account for the fact that many models have now been finetuned +to tasks in the original MTEB, and contains tasks that are not as frequently used for model training. +This way the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance. + +The original MTEB leaderboard is available under the [MTEB(eng, classic)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%2C+classic%29) tab. + """, citation="", contacts=["KennethEnevoldsen", "Muennighoff"], ) @@ -216,7 +222,12 @@ def load_results( get_task("STS22", eval_splits=["test"], hf_subsets=["en"]), ) ), - description="The original English benchmark by Muennighoff et al., (2023).", + description="""The original English benchmark by Muennighoff et al., (2023). +This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). + +> We recommend that you use [MTEB(eng)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%29) instead, +as many models have been tuned on MTEB(eng, classic) datasets, and MTEB(eng) might give a more accurate representation of models' generalization performance. 
+ """, citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", author = "Muennighoff, Niklas and @@ -275,7 +286,7 @@ def load_results( "STS22", ], ), - description="Main Russian benchmarks from MTEB", + description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, @@ -324,8 +335,8 @@ def load_results( "LegalQuAD", ] ), - description="Legal benchmarks from MTEB.", - reference="https://aclanthology.org/2023.eacl-main.148/", + description="A benchmark of retrieval tasks in the legal domain.", + reference=None, citation=None, ) @@ -365,7 +376,10 @@ def load_results( "Tatoeba", ] ), - description="BitextMining benchmark from MINERS", + description="""Bitext Mining texts from the MINERS benchmark, a benchmark designed to evaluate the + ability of multilingual LMs in semantic retrieval tasks, + including bitext mining and classification via retrieval-augmented contexts. 
+ """, reference="https://arxiv.org/pdf/2406.07424", citation=""" @article{winata2024miners, @@ -533,7 +547,7 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),) ), - description="Main French benchmarks from MTEB", + description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.", reference="https://arxiv.org/abs/2405.20468", citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence, title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, @@ -581,7 +595,7 @@ def load_results( "STS22", ], ), - description="Main German benchmarks from MTEB", + description="A benchmark for text-embedding performance in German.", reference="https://arxiv.org/html/2401.02709v1", citation="""@misc{wehrli2024germantextembeddingclustering, title={German Text Embedding Clustering Benchmark}, @@ -592,6 +606,7 @@ def load_results( primaryClass={cs.CL}, url={https://arxiv.org/abs/2401.02709}, }""", + contacts=["slvnwhrl"], ) @@ -612,7 +627,7 @@ def load_results( "KorSTS", ], ), - description="Main Korean benchmarks from MTEB", + description="A benchmark and leaderboard for evaluation of text embedding in Korean.", reference=None, citation=None, ) @@ -649,7 +664,11 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),), ), - description="Main Polish benchmarks from MTEB", + description="""Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in Polish. The PL-MTEB consists of 28 diverse NLP +tasks from 5 task types. With tasks adapted based on previously used datasets by the Polish +NLP community. 
In addition, a new PLSC (Polish Library of Science Corpus) dataset was created +consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for +two novel clustering tasks.""", # Rephrased from the abstract reference="https://arxiv.org/abs/2405.10138", citation="""@article{poswiata2024plmteb, title={PL-MTEB: Polish Massive Text Embedding Benchmark}, @@ -657,6 +676,7 @@ def load_results( journal={arXiv preprint arXiv:2405.10138}, year={2024} }""", + contacts=["rafalposwiata"], ) MTEB_code = Benchmark( @@ -693,14 +713,14 @@ def load_results( "typescript", ], ), - description="Main code benchmarks from MTEB", + description="A massive code embedding benchmark covering retrieval tasks in a miriad of popular programming languages.", reference=None, citation=None, ) MTEB_multilingual = Benchmark( - name="MTEB(Multilingual, beta)", + name="MTEB(Multilingual)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -838,10 +858,10 @@ def load_results( "MIRACLRetrievalHardNegatives", ], ), - description="The Multilingual benchmarks from MMTEB. 
Currently under development.", + description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) MTEB_JPN = Benchmark( @@ -873,7 +893,7 @@ def load_results( "ESCIReranking", ], ), - description="Main Japanese benchmarks from MTEB", + description="JMTEB is a benchmark for evaluating Japanese text embedding models.", reference="https://github.com/sbintuitions/JMTEB", citation=None, ) @@ -913,7 +933,7 @@ def load_results( ] MTEB_INDIC = Benchmark( - name="MTEB(Indic, beta)", + name="MTEB(Indic)", tasks=get_tasks( tasks=[ # Bitext @@ -950,10 +970,10 @@ def load_results( languages=indic_languages, exclusive_language_filter=True, ), - description="Main Indic benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1001,7 +1021,7 @@ def load_results( ] MTEB_EU = Benchmark( - name="MTEB(Europe, beta)", + name="MTEB(Europe)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -1082,10 +1102,10 @@ def load_results( languages=eu_languages, exclusive_language_filter=True, ), - description="Main European benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) LONG_EMBED = Benchmark( @@ -1100,7 +1120,10 @@ def load_results( "LEMBWikimQARetrieval", ], ), - description="The main benchmark for evaluating long document retrieval.", + description="""LongEmbed is a benchmark oriented at exploring models' performance on long-context retrieval. 
+ The benchmark comprises two synthetic tasks and four carefully chosen real-world tasks, + featuring documents of varying length and dispersed target information. + """, # Pieced together from paper abstract. reference="https://arxiv.org/abs/2404.12096v2", citation="""@article{zhu2024longembed, title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, @@ -1115,7 +1138,13 @@ def load_results( tasks=get_tasks( tasks=["BrightRetrieval"], ), - description="A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.", + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. + BRIGHT is the first text retrieval + benchmark that requires intensive reasoning to retrieve relevant documents with + a dataset consisting of 1,384 real-world queries spanning diverse domains, such as + economics, psychology, mathematics, and coding. These queries are drawn from + naturally occurring and carefully curated human data. + """, reference="https://brightbenchmark.github.io/", citation="""@article{su2024bright, title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, @@ -1148,3 +1177,59 @@ def load_results( reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", citation=None, ) + +C_MTEB = Benchmark( + name="MTEB(Chinese)", + tasks=MTEBTasks( + get_tasks( + tasks=[ + "T2Retrieval", + "MMarcoRetrieval", + "DuRetrieval", + "CovidRetrieval", + "CmedqaRetrieval", + "EcomRetrieval", + "MedicalRetrieval", + "VideoRetrieval", + "T2Reranking", + "MMarcoReranking", + "CMedQAv1-reranking", + "CMedQAv2-reranking", + "Ocnli", + "Cmnli", + "CLSClusteringS2S", + "CLSClusteringP2P", + "ThuNewsClusteringS2S", + "ThuNewsClusteringP2P", + "LCQMC", + "PAWSX", + "AFQMC", + "QBQTC", + "TNews", + "IFlyTek", + "Waimai", + "OnlineShopping", + "JDReview", + ], + ) + + get_tasks(tasks=["MultilingualSentiment"], eval_splits=["test"]) + + get_tasks( + tasks=[ + "ATEC", + 
"BQ", + "STSB", + ], + eval_splits=["validation"], + ) + ), + description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", + reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", + citation="""@misc{c-pack, + title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, + author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, + year={2023}, + eprint={2309.07597}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +}""", +) diff --git a/mteb/cli.py b/mteb/cli.py index 3c6c821f52..c552394e49 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -374,26 +374,7 @@ def main(): add_create_meta_parser(subparsers) args = parser.parse_args() - - # If no subcommand is provided, default to run with a deprecation warning - if not hasattr(args, "func"): - logger.warning( - "Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.", - DeprecationWarning, - ) - # Set default arguments for 'run' if no subcommand is provided - default_args = parser.parse_args( - ["run"] - + list(map(str, args._get_args())) - + [ - f"--{k}" if v is None else f"--{k}={v}" - for k, v in vars(args).items() - if k != "func" - ] - ) - default_args.func(default_args) - else: - args.func(args) + args.func(args) if __name__ == "__main__": diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 0c07ff34db..377f8b72eb 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -5,28 +5,32 @@ import os import traceback from collections.abc import Iterable -from copy import copy, deepcopy +from copy import deepcopy from datetime import datetime from itertools import chain from pathlib import Path from time import time -from typing import Any +from typing import TYPE_CHECKING, Any import datasets from codecarbon import EmissionsTracker from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from 
mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta -from mteb.models import model_meta_from_sentence_transformers +from mteb.models import ( + model_meta_from_cross_encoder, + model_meta_from_sentence_transformers, +) from ..abstasks.AbsTask import AbsTask -from ..abstasks.AbsTaskMultilabelClassification import AbsTaskMultilabelClassification -from ..abstasks.AbsTaskReranking import AbsTaskReranking from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper -from . import LangMapping + +if TYPE_CHECKING: + from mteb.benchmarks import Benchmark logger = logging.getLogger(__name__) @@ -34,124 +38,41 @@ class MTEB: def __init__( self, - tasks: Iterable[str | AbsTask] | None = None, + tasks: Iterable[AbsTask | Benchmark], *, - task_types: list[str] | None = None, - task_categories: list[str] | None = None, - task_langs: list[str] | None = None, - version=None, err_logs_path: str = "error_logs.txt", - **kwargs, ): """Create an Evaluation pipeline, based on the provided tasks. Args: - tasks: List of tasks to be evaluated. - task_types: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task types (Clustering, Retrieval..) to be - evaluated. If None, all tasks will be evaluated - task_categories: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task categories (s2s, p2p..) to be - evaluated. If None, all tasks will be evaluated - task_langs: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of languages to be evaluated. if None, all - languages will be evaluated. ["eng-Latn", "deu_Latn"] will evaluate on all tasks with these languages. - version: Will be deprecated. Version of the benchmark to use. If None, latest is used + tasks: List of tasks or benchmarks to be evaluated, e.g. 
tasks returned by + `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)"). err_logs_path: Path to save error logs. - kwargs: Additional arguments to be passed to the tasks """ from mteb.benchmarks import Benchmark - self.deprecation_warning( - task_types, task_categories, task_langs, tasks, version - ) - - if tasks is not None: - self._tasks = tasks - if isinstance(tasks[0], Benchmark): - self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) - assert ( - task_types is None and task_categories is None - ), "Cannot specify both `tasks` and `task_types`/`task_categories`" - else: - self._task_types = task_types - self._task_categories = task_categories - self._tasks = None + self.tasks = list(tasks) + if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark): + self.benchmarks = tasks + self.tasks = list(chain.from_iterable(self.tasks)) - self._task_langs = task_langs if task_langs is not None else [] - if isinstance(self._task_langs, str): - self._task_langs = [self._task_langs] - - self._extend_lang_code() - self._extend_lang_pairs() # add all possible pairs - - self._version = version self.err_logs_path = err_logs_path - self.last_evaluated_splits = {} - self.select_tasks(**kwargs) - - def deprecation_warning( - self, task_types, task_categories, task_langs, tasks, version - ): - if task_types is not None: - logger.warning( - "The `task_types` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... task_types = [...])` to filter tasks instead." - ) - if task_categories is not None: - logger.warning( - "The `task_categories` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... categories = [...])` to filter tasks instead." - ) - if task_langs is not None: - logger.warning( - "The `task_langs` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... 
languages = [...])` to filter tasks instead. " - + "Note that this uses 3 letter language codes (ISO 639-3)." - ) - if version is not None: - logger.warning( - "The `version` argument is deprecated and will be removed in the next release." - ) - task_contains_strings = any(isinstance(x, str) for x in tasks or []) - if task_contains_strings: - logger.warning( - "Passing task names as strings is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead." - ) - @property def available_tasks(self): - return [x.metadata.name for x in self.tasks_cls] + return [x.metadata.name for x in self.tasks] @property def available_task_types(self): # sort the task types - return sorted({x.metadata.type for x in self.tasks_cls}) + return sorted({x.metadata.type for x in self.tasks}) @property def available_task_categories(self): - return {x.metadata.category for x in self.tasks_cls} - - def _extend_lang_code(self): - # add all possible language codes - for lang in set(self._task_langs): - if lang in LangMapping.LANG_MAPPING: - self._task_langs += LangMapping.LANG_MAPPING[lang] - - def _extend_lang_pairs(self): - # add all possible language pairs - langs = set(self._task_langs) - for x in langs: - if "-" not in x: - for y in langs: - if "-" not in y: - pair = f"{x}-{y}" - if pair not in langs: - self._task_langs.append(pair) - return - - def _display_tasks(self, task_list, name=None): + return {x.metadata.category for x in self.tasks} + + def _display_tasks(self, task_list: Iterable[AbsTask], name: str | None = None): from rich.console import Console # disable logging for other ranks @@ -215,80 +136,14 @@ def mteb_benchmarks(self): @classmethod def mteb_tasks(cls): """Get all tasks available in the MTEB.""" - instance = cls() - instance._display_tasks(instance.tasks_cls, name="MTEB tasks") + tasks = mteb.get_tasks() + instance = cls(tasks) + instance._display_tasks(tasks, name="MTEB tasks") def 
print_selected_tasks(self): """Print the selected tasks.""" self._display_tasks(self.tasks, name="Selected tasks") - def select_tasks(self, **kwargs): - """Select the tasks to be evaluated.""" - # Get all existing tasks - # reranking and multiclassClassification subclasses retrieval to share methods, but is an abstract task - tasks_categories_cls = list(AbsTask.__subclasses__()) + [ - AbsTaskReranking, - AbsTaskMultilabelClassification, - ] - all_task_classes = [] - for cat_cls in tasks_categories_cls: - for cls in cat_cls.__subclasses__(): - if cat_cls.__name__.startswith("AbsTask") and cls.__name__ not in ( - "AbsTaskReranking", - "AbsTaskMultilabelClassification", - ): - task = cls(hf_subsets=self._task_langs, **kwargs) - all_task_classes.append(task) - - self.tasks_cls = all_task_classes - - # If `task_list` is specified, select list of tasks - if self._tasks is not None: - self.tasks = list( - filter(lambda x: (x.metadata.name in self._tasks), self.tasks_cls) - ) - if len(self.tasks) != len(self._tasks): - tasks_known = {x.metadata.name for x in self.tasks_cls} - tasks_unknown = { - x for x in self._tasks if isinstance(x, str) - } - tasks_known - if tasks_unknown: - unknown_str, known_str = ( - ",".join(sorted(tasks_unknown)), - ",".join(sorted(tasks_known)), - ) - logger.warning( - f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." 
- ) - # add task if subclass of mteb.tasks - self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) - return - - # Otherwise use filters to select tasks - filtered_tasks = filter( - lambda x: (self._task_types is None) - or (x.metadata.type in self._task_types), - self.tasks_cls, - ) - filtered_tasks = filter( - lambda x: (self._task_categories is None) - or (x.metadata.category in self._task_categories), - filtered_tasks, - ) - filtered_tasks = filter( - lambda x: (self._version is None) or (x.metadata.version >= self._version), - filtered_tasks, - ) - # keep only tasks with at least one language in the filter - filtered_tasks = filter( - lambda x: (not self._task_langs) - or (len(set(x.metadata.eval_langs) & set(self._task_langs)) > 0), - filtered_tasks, - ) - - # Get final list of tasks - self.tasks = list(filtered_tasks) - def load_tasks_data(self): """Load datasets for the selected tasks.""" logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks") @@ -416,13 +271,6 @@ def run( Returns: A list of TaskResult objects, one for each task evaluated. """ - if "batch_size" in kwargs: - logger.warning( - "The `batch_size` argument is deprecated and will be removed in the next release. " - + "Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead." - ) - encode_kwargs["batch_size"] = kwargs["batch_size"] - # update logging to account for different levels of Verbosity (similar to the command line) if verbosity == 0: @@ -455,8 +303,8 @@ def run( self.print_selected_tasks() evaluation_results = [] - original_tasks = ( - self.tasks.copy() + original_tasks = deepcopy( + self.tasks ) # save them in case we re-use the object (e.g. for reranking) # To evaluate missing splits, we keep track of the task name and the corresponding splits. 
@@ -650,7 +498,7 @@ def create_model_meta(model: Encoder) -> ModelMeta: meta = model.mteb_model_meta # type: ignore else: try: - meta = model_meta_from_sentence_transformers(model) # type: ignore + meta = MTEB._get_model_meta(model) except AttributeError: logger.warning( "Could not find model metadata. Please set the model.mteb_model_meta attribute or if you are using " @@ -665,7 +513,7 @@ def create_model_meta(model: Encoder) -> ModelMeta: ) # create a copy of the meta to avoid modifying the original object - meta = copy(meta) + meta = deepcopy(meta) meta.revision = meta.revision or "no_revision_available" meta.name = meta.name or "no_model_name_available" @@ -752,3 +600,11 @@ def _get_missing_evaluations( missing_evaluations[split]["missing_subsets"] = missing_subsets return missing_evaluations + + @staticmethod + def _get_model_meta(model: Encoder) -> ModelMeta: + if isinstance(model, CrossEncoder): + meta = model_meta_from_cross_encoder(model) + else: + meta = model_meta_from_sentence_transformers(model) + return meta diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 83726b8453..fc96ed23f6 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -82,6 +82,7 @@ def __call__( self.top_k, task_name=self.task_name, # type: ignore instructions=instructions, + score_function="bm25", **kwargs, ) else: diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index b3c5c77894..8aecc6fa6d 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -330,9 +330,19 @@ def _full_corpus_search( query_embeddings = torch.as_tensor(query_embeddings).to(device) sub_corpus_embeddings = torch.as_tensor(sub_corpus_embeddings).to(device) - score_function = ( - self.model.similarity if hasattr(self.model, "similarity") else cos_sim - ) + if 
hasattr(self.model.model, "mteb_model_meta") or hasattr( + self.model, "similarity" + ): + score_function = ( + self.model.similarity + if hasattr(self.model, "similarity") + else self.model.model.mteb_model_meta.get_similarity_function() + ) + else: + logger.warning( + "The model does not provide `mteb_model_meta`; defaulting to the cosine similarity function." + ) + score_function = cos_sim with torch.inference_mode(): scores = score_function(query_embeddings, sub_corpus_embeddings) diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index e01e0ec463..14ca673ce9 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -70,6 +70,34 @@ def _cos_sim_core(a_tensor, b_tensor): return _cos_sim_core(a, b) +def max_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Computes the max-similarity max_sim(a[i], b[j]) for all i and j. + Works with a Tensor of the shape (batch_size, num_tokens, token_dim) + + Return: + Matrix with res[i][j] = max_sim(a[i], b[j]) + """ # noqa: D402 + if not isinstance(a, torch.Tensor): + a = torch.tensor(a, dtype=torch.float32) + + if not isinstance(b, torch.Tensor): + b = torch.tensor(b, dtype=torch.float32) + + if len(a.shape) == 2: + a = a.unsqueeze(0) + + if len(b.shape) == 2: + b = b.unsqueeze(0) + + scores = torch.einsum( + "ash,bth->abst", + a, + b, + ) + + return scores.max(axis=-1).values.sum(axis=-1) + + def dot_score(a: torch.Tensor, b: torch.Tensor): """Computes the dot-product dot_prod(a[i], b[j]) for all i and j. 
:return: Matrix with res[i][j] = dot_prod(a[i], b[j]) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index d1383cf1a7..5ee5a6b9da 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -6,6 +6,7 @@ import tempfile import time from pathlib import Path +from typing import Literal from urllib.parse import urlencode import gradio as gr @@ -24,7 +25,9 @@ def load_results(): results_cache_path = Path(__file__).parent.joinpath("__cached_results.json") if not results_cache_path.exists(): all_results = ( - mteb.load_results(only_main_score=True).join_revisions().filter_models() + mteb.load_results(only_main_score=True, require_model_meta=False) + .join_revisions() + .filter_models() ) all_results.to_disk(results_cache_path) return all_results @@ -46,9 +49,12 @@ def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: return md +DEFAULT_BENCHMARK_NAME = "MTEB(Multilingual)" + + def set_benchmark_on_load(request: gr.Request): query_params = request.query_params - return query_params.get("benchmark_name", "MTEB(Multilingual, beta)") + return query_params.get("benchmark_name", DEFAULT_BENCHMARK_NAME) def download_table(table: pd.DataFrame) -> Path: @@ -115,23 +121,75 @@ def update_task_info(task_names: str) -> gr.DataFrame: return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1)) +# Model sizes in million parameters +MIN_MODEL_SIZE, MAX_MODEL_SIZE = 0, 10_000 + + +def filter_models( + model_names, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot_setting, +): + lower, upper = model_size + # Setting to None, when the user doesn't specify anything + if (lower == MIN_MODEL_SIZE) and (upper == MAX_MODEL_SIZE): + lower, upper = None, None + else: + # Multiplying by millions + lower = lower * 1e6 + upper = upper * 1e6 + model_metas = mteb.get_model_metas( + model_names=model_names, + open_weights=availability, + use_instructions=instructions, + frameworks=compatibility, 
+ n_parameters_range=(lower, upper), + ) + tasks = mteb.get_tasks(tasks=task_select) + models_to_keep = set() + for model_meta in model_metas: + is_model_zero_shot = model_meta.is_zero_shot_on(tasks) + if is_model_zero_shot is None: + if zero_shot_setting == "hard": + continue + elif not is_model_zero_shot: + if zero_shot_setting != "off": + continue + models_to_keep.add(model_meta.name) + return list(models_to_keep) + + logger.info("Loading all benchmark results") all_results = load_results() -# Model sizes in million parameters -min_model_size, max_model_size = 0, 10_000 - benchmarks = mteb.get_benchmarks() all_benchmark_results = { benchmark.name: benchmark.load_results(base_results=all_results) for benchmark in benchmarks } -default_benchmark = mteb.get_benchmark("MTEB(Multilingual, beta)") +default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) default_results = all_benchmark_results[default_benchmark.name] logger.info("Benchmark results loaded") default_scores = default_results.get_scores(format="long") -summary_table, per_task_table = scores_to_tables(default_scores) +all_models = list({entry["model_name"] for entry in default_scores}) +filtered_models = filter_models( + all_models, + default_results.task_names, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot_setting="soft", +) + +summary_table, per_task_table = scores_to_tables( + [entry for entry in default_scores if entry["model_name"] in filtered_models] +) benchmark_select = gr.Dropdown( [bench.name for bench in benchmarks], @@ -205,7 +263,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: with gr.Row(): searchbar = gr.Textbox( label="Search Models", - info="Search models by name (RegEx sensitive. Separate queries with `|`)", + info="Press Enter to search.\nSearch models by name (RegEx sensitive. 
Separate queries with `|`)", interactive=True, ) compatibility = gr.CheckboxGroup( @@ -256,14 +314,14 @@ def update_task_info(task_names: str) -> gr.DataFrame: interactive=True, ) model_size = RangeSlider( - minimum=min_model_size, - maximum=max_model_size, - value=(min_model_size, max_model_size), + minimum=MIN_MODEL_SIZE, + maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), label="Model Size (#M Parameters)", interactive=True, ) scores = gr.State(default_scores) - models = gr.State(list({entry["model_name"] for entry in default_scores})) + models = gr.State(filtered_models) with gr.Row(): with gr.Column(): description = gr.Markdown( @@ -293,6 +351,10 @@ def update_task_info(task_names: str) -> gr.DataFrame: """ ) summary_table.render() + download_summary = gr.DownloadButton("Download Table") + download_summary.click( + download_table, inputs=[summary_table], outputs=[download_summary] + ) with gr.Accordion( "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", open=False, @@ -306,10 +368,19 @@ def update_task_info(task_names: str) -> gr.DataFrame: **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. """ ) - download_summary = gr.DownloadButton("Download Table") - download_summary.click( - download_table, inputs=[summary_table], outputs=[download_summary] - ) + with gr.Accordion( + "What does zero-shot mean?", + open=False, + ): + gr.Markdown( + """ +A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. 
+E.g., if a model is trained on Natural Questions, it cannot be considered zero-shot on benchmarks containing the task โ€œNQโ€ which is derived from Natural Questions. +This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., โ€œWikipediaRetrievalMultilingualโ€ and โ€œWikiClusteringP2Pโ€ as these datasets are not based on title-body pairs. +Distilled, further fine-tunes or in other ways, derivative models inherit the datasets of their parent models. +Based on community feedback and research findings, This definition could change in the future. + """ + ) with gr.Tab("Performance per task"): per_task_table.render() download_per_task = gr.DownloadButton("Download Table") @@ -403,51 +474,14 @@ def update_task_list(benchmark_name, type_select, domain_select, lang_select): outputs=[task_select], ) - def filter_models( - model_names, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ): - lower, upper = model_size - # Setting to None, when the user doesn't specify anything - if (lower == min_model_size) and (upper == max_model_size): - lower, upper = None, None - else: - # Multiplying by millions - lower = lower * 1e6 - upper = upper * 1e6 - model_metas = mteb.get_model_metas( - model_names=model_names, - open_weights=availability, - use_instructions=instructions, - frameworks=compatibility, - n_parameters_range=(lower, upper), - ) - tasks = mteb.get_tasks(tasks=task_select) - models_to_keep = set() - for model_meta in model_metas: - is_zero_shot = model_meta.is_zero_shot_on(tasks) - if is_zero_shot is None: - if zero_shot == "hard": - continue - if not zero_shot: - if zero_shot != "off": - continue - models_to_keep.add(model_meta.name) - return list(models_to_keep) - def update_models( - scores, - tasks, - availability, - compatibility, - instructions, - model_size, - zero_shot, + scores: list[dict], + tasks: 
list[str], + availability: bool | None, + compatibility: list[str], + instructions: bool | None, + model_size: tuple[int, int], + zero_shot: Literal["hard", "soft", "off"], ): start_time = time.time() model_names = list({entry["model_name"] for entry in scores}) @@ -458,7 +492,7 @@ def update_models( compatibility, instructions, model_size, - zero_shot, + zero_shot_setting=zero_shot, ) elapsed = time.time() - start_time logger.info(f"update_models callback: {elapsed}s") @@ -542,7 +576,7 @@ def update_models( ], outputs=[models], ) - zero_shot.input( + zero_shot.change( update_models, inputs=[ scores, @@ -592,7 +626,7 @@ def update_tables( inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], ) - searchbar.input( + searchbar.submit( update_tables, inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 9a1dc57994..ef28392cf7 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -88,7 +88,7 @@ def get_means_per_types(per_task: pd.DataFrame): dict( model_name=model_name, task_type=task_type, - score=scores[tasks].mean(), + score=scores[tasks].mean(skipna=False), ) ) return pd.DataFrame.from_records(records) @@ -142,6 +142,11 @@ def scores_to_tables( names = per_task.index.get_level_values("model_name") names = pd.Series(names, index=per_task.index) to_remove |= ~names.str.contains(search_query, regex=True) + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) models_to_remove = list(per_task[to_remove].index) typed_mean = mean_per_type.mean(skipna=False, axis=1) overall_mean = per_task.mean(skipna=False, axis=1) @@ -218,7 +223,11 @@ def scores_to_tables( joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = ( joint_table.style.format( 
- {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"} + { + **{column: "{:.2f}" for column in score_columns}, + "Rank (Borda)": "{:.0f}", + }, + na_rep="", ) .highlight_min("Rank (Borda)", props="font-weight: bold") .highlight_max(subset=score_columns, props="font-weight: bold") @@ -226,7 +235,7 @@ def scores_to_tables( task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 per_task_style = per_task.style.format( - "{:.2f}", subset=task_score_columns + "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") return ( gr.DataFrame( diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index 015a96d337..e1632a3dec 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -260,8 +260,16 @@ def parse_version(version_str: str) -> Version | None: def keep_best(group: pd.DataFrame) -> pd.DataFrame: is_main_revision = group["revision"] == group["main_revision"] - if is_main_revision.sum() == 1: - return group[is_main_revision] + # If the main revision is present we select that + if is_main_revision.sum() > 0: + return group[is_main_revision].head(n=1) + unique_revisions = group["revision"].unique() + # Filtering out no_revision_available if other revisions are present + if (len(unique_revisions) > 1) and ( + "no_revision_available" in unique_revisions + ): + group = group[group["revision"] != "no_revision_available"] + # If there are any not-NA mteb versions, we select the latest one if group["mteb_version"].notna().any(): group = group.dropna(subset=["mteb_version"]) group = group.sort_values("mteb_version", ascending=False) diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py index 03ec6fb308..ef851a1dc2 100644 --- a/mteb/load_results/load_results.py +++ b/mteb/load_results/load_results.py @@ -139,6 +139,7 @@ def load_results( continue 
model_name, revision = model_name_and_revision + model_name = model_name.replace("__", "/") if models_to_keep is not None and model_name not in models_to_keep: continue elif models_to_keep is not None and models_to_keep[model_name] is not None: diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 7cdadd7ac0..d940202499 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -390,15 +390,16 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult: main_score = task.metadata.main_score for split, split_score in scores.items(): for hf_subset, hf_subset_scores in split_score.items(): - if task.metadata.type == "STS": - for name, prev_name in [ - ("cosine", "cos_sim"), - ("manhattan", "manhattan"), - ("euclidean", "euclidean"), - ]: - prev_name_scores = hf_subset_scores.pop( - prev_name, {"spearman": "NaN"} - ) + for name, prev_name in [ + ("cosine", "cos_sim"), + ("manhattan", "manhattan"), + ("euclidean", "euclidean"), + ("dot", "dot"), + ("max", "max"), + ("similarity", "similarity"), + ]: + prev_name_scores = hf_subset_scores.pop(prev_name, None) + if prev_name_scores is not None: for k, v in prev_name_scores.items(): hf_subset_scores[f"{name}_{k}"] = v diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 1754ab4bbb..b0dbccf24e 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -4,11 +4,13 @@ from functools import partial from typing import TYPE_CHECKING, Any, Callable, Literal +import numpy as np from pydantic import BaseModel, ConfigDict from mteb.abstasks.AbsTask import AbsTask from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL from mteb.encoder_interface import Encoder +from mteb.evaluation.evaluators.utils import cos_sim, dot_score, max_sim from .languages import ISO_LANGUAGE_SCRIPT @@ -30,7 +32,7 @@ "PyLate", "ColBERT", ] -DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"] +DISTANCE_METRICS = Literal["cosine", "MaxSim", "dot"] def 
sentence_transformers_loader( @@ -66,8 +68,8 @@ class ModelMeta(BaseModel): release_date: The date the model's revision was released. license: The license under which the model is released. Required if open_weights is True. open_weights: Whether the model is open source or proprietary. - public_training_data: Whether the training data used to train the model is publicly available. - public_training_code: Whether the code used to train the model is publicly available. + public_training_code: A link to the publicly available training code. If none it is assumed that the training code is not publicly available. + public_training_data: A link to the publicly available training data. If none it is assumed that the training data is not publicly available. similarity_fn_name: The distance metric used by the model. framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`. reference: A URL to the model's page on huggingface or another source. 
@@ -90,22 +92,32 @@ class ModelMeta(BaseModel): release_date: STR_DATE | None languages: list[ISO_LANGUAGE_SCRIPT] | None loader: Callable[..., Encoder] | None = None - n_parameters: int | None = None - max_tokens: float | None = None - embed_dim: int | None = None - license: str | None = None - open_weights: bool | None = None - public_training_data: bool | None = None - public_training_code: bool | None = None - framework: list[FRAMEWORKS] = [] + n_parameters: int | None + max_tokens: float | None + embed_dim: int | None + license: str | None + open_weights: bool | None + public_training_code: str | None + public_training_data: str | bool | None + framework: list[FRAMEWORKS] reference: STR_URL | None = None - similarity_fn_name: DISTANCE_METRICS | None = None - use_instructions: bool | None = None - training_datasets: dict[str, list[str]] | None = None + similarity_fn_name: DISTANCE_METRICS | None + use_instructions: bool | None + training_datasets: dict[str, list[str]] | None adapted_from: str | None = None superseded_by: str | None = None citation: str | None = None + def get_similarity_function(self) -> Callable[[np.ndarray, np.ndarray], np.ndarray]: + if self.similarity_fn_name == "cosine": + return cos_sim + elif self.similarity_fn_name == "dot": + return dot_score + elif self.similarity_fn_name == "MaxSim": + return max_sim + elif self.similarity_fn_name is None: + raise ValueError("Similarity function not specified.") + def to_dict(self): dict_repr = self.model_dump() loader = dict_repr.pop("loader", None) diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 1c70b528ce..1389e23982 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -6,6 +6,7 @@ get_model, get_model_meta, get_model_metas, + model_meta_from_cross_encoder, model_meta_from_sentence_transformers, ) from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper @@ -17,5 +18,6 @@ "get_model_meta", "get_model_metas", 
"model_meta_from_sentence_transformers", + "model_meta_from_cross_encoder", "SentenceTransformerWrapper", ] diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index 3ee3267999..dd3cd1c8df 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -115,6 +115,32 @@ primaryClass={cs.LG}, url={https://arxiv.org/abs/2407.18887}, }""", + public_training_code=None, + public_training_data=None, + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) @@ -139,8 +165,8 @@ use_instructions=True, adapted_from="sentence-transformers/all-MiniLM-L6-v2", superseded_by=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, + public_training_data=None, training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -189,8 +215,8 @@ use_instructions=True, adapted_from="intfloat/e5-small-unsupervised", superseded_by=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -239,8 +265,8 @@ 
use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5", - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -289,8 +315,8 @@ use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -339,8 +365,8 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -391,6 +417,9 @@ use_instructions=True, adapted_from=None, superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", + public_training_code=None, + public_training_data=None, + training_datasets=None, ) arctic_embed_m_v2_0 = ModelMeta( @@ -415,8 +444,8 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -464,8 +493,8 @@ use_instructions=True, adapted_from="BAAI/bge-m3-retromae", superseded_by=None, - 
public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index fcbe14d07b..91ff256bb8 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -4,6 +4,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + model_prompts = {"query": "Represent this sentence for searching relevant passages: "} BGE_15_CITATION = """@misc{bge_embedding, title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, @@ -13,6 +15,289 @@ archivePrefix={arXiv}, primaryClass={cs.CL} }""" +model_prompts_zh = {"query": "ไธบ่ฟ™ไธชๅฅๅญ็”Ÿๆˆ่กจ็คบไปฅ็”จไบŽๆฃ€็ดข็›ธๅ…ณๆ–‡็ซ ๏ผš"} + +bge_m3_training_data = { + # source: https://arxiv.org/abs/2402.03216 + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "LeCaRDv2": ["train"], + "CMedQAv1-reranking": ["train"], + "CMedQAv2-reranking": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CodeSearchNet": ["train"], + # not in mteb + # "s2orc" + # Wikipedia + # "xP3" + # "mC4" + # "CC-News" + # "MTP" + # "NLLB" + # "CCMatrix" + # TriviaQA + # COL-IEE + # PubMedQA + # SQuAD + # SimCSE + # mMARCO-ZH + # LawGPT + # 
NLI-zh2, LeCaRDv2, + # NLI, MultiLongDoc (their syntetic) + # + synthetic data +} + +bge_training_data = { + # source: https://data.baai.ac.cn/details/BAAI-MTP + "NQ": ["test"], + "NQHardNegatives": ["test"], + "AmazonReviewsClassification": [ + "validation", + "test", + ], # assumed from: amazon_reviews_multi + "MLQARetrieval": [ + "validation", + "test", + ], # assumed from mlqa (question, context) + # not in mteb + # Dataset Pairs + # wudao (title, passage) + # cmrc2018 (query, context) + # dureader (query, context) + # simclue (sentence_a, sentence_b) + # csl (title, abstract) + # amazon_reviews_multi (title, body) + # wiki_atomic_edits (base_sentence, edited_sentence) + # mlqa (question, context) + # xlsum (title, summary) (title, text) + # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further + # "wikipedia": [], # title + section title, passage + # "reddit": [], # title, body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) +} + +bge_chinese_training_data = { + # source: https://arxiv.org/pdf/2309.07597 + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "Cmnli": ["train"], + "Ocnli": ["train"], + # not in mteb + # - multi-cpr + # - NLI-zh + # Dataset Pairs + # wudao (title, passage) + # cmrc2018 (query, context) + # dureader (query, context) + # simclue (sentence_a, sentence_b) + # csl (title, abstract) + # amazon_reviews_multi (title, body) + # wiki_atomic_edits (base_sentence, edited_sentence) + # mlqa (question, context) + # xlsum (title, summary) (title, text) + # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further + # "wikipedia": [], # title + section title, passage + # "reddit": [], # title, 
body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) +} + +# https://huggingface.co/BAAI/bge-m3/discussions/29 +bgem3_languages = [ + "afr_Latn", # af + # als + "amh_Ethi", # am + # an + # ar + "azj_Latn", # arz + # as + "ast_Latn", # ast + # av + # az + "azj_Latn", # azb + # ba + # bar + # bcl + "ben_Beng", # be + "bul_Cyrl", # bg + # bh + # bn + # bo + "bel_Cyrl", # bpy + # br + # bs + # bxr + "cat_Latn", # ca + # cbk + # ce + "ceb_Latn", # ceb + "ckb_Arab", # ckb + # co + # cs + # cv + # cy + "dan_Latn", # da + "deu_Latn", # de + # diq + # dsb + # dty + # dv + "ell_Grek", # el + # eml + "eng_Latn", # en + # eo + "est_Latn", # es + # et + # eu + # fa + "fin_Latn", # fi + "fra_Latn", # fr + # fy + # ga + # gd + "glg_Latn", # gl + # gn + # gom + "guj_Gujr", # gu + # gv + "heb_Hebr", # he + "hin_Deva", # hi + # hif + # hr + # hsb + # ht + # hu + # hy + # ia + # id + # ie + # ilo + # io + # is + "ita_Latn", # it + "jpn_Jpan", # ja + # jbo + # jv + # ka + # kk + # km + # kn + "kor_Hang", # ko + # krc + # ku + # kv + # kw + # ky + # la + # lb + # lez + # li + # lmo + # lo + # lt + # lv + # mai + # mg + # mhr + # min + # mk + # ml + # mn + # mr + # mrj + # ms + # mt + # mwl + # my + # myv + # mzn + # nah + # nap + # nds + # ne + # new + # nl + # nn + # no + # oc + # or + # os + # pa + # pam + # pfl + # pl + # pms + # pnb + # ps + # pt + # qu + # rm + # ro + "rus_Cyrl", # ru + # sa + # sah + # sc + # scn + # sco + # sd + # sh + # si + # sk + # sl + # so + # sq + # sr + # su + # sv + # sw + # ta + # te + # tg + "tha_Thai", # th + # tk + # tl + # tr + # tt + # tyv + # ug + "ukr_Cyrl", # uk + # ur + # uz + # vec + # vep + # vi + # vls + # vo + # wa + # war + # wuu + # xal + # xmf + # yi + # yo + # yue + "zho_Hans", # zh +] + bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore @@ -34,38 +319,10 @@ similarity_fn_name="cosine", 
framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", + training_datasets=bge_training_data, citation=BGE_15_CITATION, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP - public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, ) bge_base_en_v1_5 = ModelMeta( @@ -88,38 +345,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - citation=BGE_15_CITATION, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - 
"validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", + training_datasets=bge_training_data, + citation=BGE_15_CITATION, ) bge_large_en_v1_5 = ModelMeta( @@ -143,35 +372,220 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, citation=BGE_15_CITATION, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", + training_datasets=bge_training_data, +) + +bge_small_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-small-zh-v1.5", + revision="7999e1d3359715c523056ef9478215996d62a620", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-small-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="7999e1d3359715c523056ef9478215996d62a620", + release_date="2023-09-12", # initial commit of hf model. 
+ n_parameters=24_000_000, + embed_dim=512, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-small-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=bge_chinese_training_data, +) + +bge_base_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-base-zh-v1.5", + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-base-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + release_date="2023-09-11", # initial commit of hf model. + n_parameters=438_000_000, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-base-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=bge_chinese_training_data, +) + +bge_large_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-large-zh-v1.5", + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-large-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + release_date="2023-09-12", # initial commit of hf model. 
+ n_parameters=1_340_000_000, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-large-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=bge_chinese_training_data, +) + +bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-m3", + revision="5617a9f61b028005a4858fdac845db406aefb181", + ), + name="BAAI/bge-m3", + languages=bgem3_languages, + open_weights=True, + revision="5617a9f61b028005a4858fdac845db406aefb181", + release_date="2024-06-28", + n_parameters=568_000_000, + embed_dim=4096, + license="mit", + max_tokens=8194, + reference="https://huggingface.co/BAAI/bge-m3", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_m3_training_data, +) + + +bge_multilingual_gemma2 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-multilingual-gemma2", + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + ), + name="BAAI/bge-multilingual-gemma2", + languages=[ + "eng_Latn", + "zho_Hans", + "kor_Hang", + "kor_Latn", + "fra_Latn", + "jpn_Jpan", + "jpn_Latn", + ], # This list is incomlete. Their description says "and more". + # I'm also unsure about the scripts. + open_weights=True, + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + release_date="2024-07-25", # initial commit of hf model. 
+ n_parameters=9.24 * 1e9, + embed_dim=3584, # from old C-MTEB leaderboard + license="gemma", + max_tokens=8192, # from old C-MTEB leaderboard + reference="https://huggingface.co/BAAI/bge-multilingual-gemma2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, # not disclosed +) + +# Contents of cfli/bge-full-data +bge_full_data = { + # source: https://arxiv.org/pdf/2409.15700 + # Charles Goodhart is turning back and forth + # in his grave as I'm annotating this + # |Retrieval| + # ELI5 + # SQuaD + # TriviaQA + # QuoraDuplicateQuestions + "HotpotQA": ["train"], + "FEVER": ["train"], + "MSMARCO": ["train"], + "NQ": ["train"], + "ArguAna": ["train"], + "FiQA2018": ["train"], + # |Reranking| + "SciDocsReranking": ["train"], + "StackOverflowDupQuestions": ["train"], + # |Classification| + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ImdbClassification": ["train"], + "ToxicConversationsClassification": ["train"], + # |Clustering| + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringP2P": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringP2P": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "RedditClusteringP2P": ["train"], + "RedditClustering": ["train"], + "RedditClustering.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + # |STS| + "STS22": ["train"], + "STS22.v2": ["train"], + "STSBenchmark": ["train"], +} + +bge_en_icl = ModelMeta( + 
loader=partial( + sentence_transformers_loader, + model_name="BAAI/bge-en-icl", + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + ), + name="BAAI/bge-en-icl", + languages=[ + "eng_Latn", + ], + open_weights=True, + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + release_date="2024-07-25", # initial commit of hf model. + n_parameters=7.11 * 1e9, + embed_dim=4096, + license="apache-2", + max_tokens=32768, + reference="https://huggingface.co/BAAI/bge-en-icl", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code="https://github.com/FlagOpen/FlagEmbedding", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) + **E5_MISTRAL_TRAINING_DATA, + **bge_full_data, }, + adapted_from="intfloat/e5-mistral-7b-instruct", ) diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index e15ef83b9c..757081f18e 100644 --- 
a/mteb/models/bm25.py +++ b/mteb/models/bm25.py @@ -125,8 +125,11 @@ def encode(self, texts: list[str], **kwargs): embed_dim=None, license=None, max_tokens=None, - reference=None, + reference="https://github.com/xhluca/bm25s", similarity_fn_name=None, framework=[], use_instructions=False, + public_training_code="https://github.com/xhluca/bm25s", + public_training_data=None, + training_datasets=None, ) diff --git a/mteb/models/cde_models.py b/mteb/models/cde_models.py new file mode 100644 index 0000000000..78870ef129 --- /dev/null +++ b/mteb/models/cde_models.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import logging + +from mteb.model_meta import ModelMeta + +from .bge_models import bge_full_data + +logger = logging.getLogger(__name__) + + +cde_small_v1 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v1", + languages=["eng_Latn"], + open_weights=True, + revision="8d5736163718a8b65cd787b75ed61020d18bad3c", + release_date="2024-09-24", + n_parameters=int(281 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://huggingface.co/jxm/cde-small-v1", + use_instructions=True, + adapted_from="nomic-ai/nomic-bert-2048", + superseded_by="jxm/cde-small-v2", + training_datasets=bge_full_data, + public_training_code="https://github.com/jxmorris12/cde", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", +) + +cde_small_v2 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v2", + languages=["eng_Latn"], + open_weights=True, + revision="a7e5882ad52c27ea2831fc8258f24379c25cb459", + release_date="2025-01-13", + n_parameters=int(306 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + 
reference="https://huggingface.co/jxm/cde-small-v2",
+    use_instructions=True,
+    adapted_from="answerdotai/ModernBERT-base",
+    superseded_by=None,
+    training_datasets=bge_full_data,
+    public_training_code="https://github.com/jxmorris12/cde",
+    public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+)
diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py
index 0cf0ffe670..60ff63ee81 100644
--- a/mteb/models/cohere_models.py
+++ b/mteb/models/cohere_models.py
@@ -234,8 +234,8 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,
+    public_training_data=None,  # assumed
     training_datasets=None,
 )
 
@@ -258,8 +258,8 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,
+    public_training_data=None,  # assumed
     training_datasets=None,
 )
 
@@ -282,8 +282,8 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,
+    public_training_data=None,  # assumed
     training_datasets=None,
 )
 
@@ -306,7 +306,7 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
-    public_training_data=False,  # assumed
-    public_training_code=False,  # assumed
+    public_training_code=None,
+    public_training_data=None,  # assumed
     training_datasets=None,
 )
diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py
index 8753791bff..6c29510855 100644
--- a/mteb/models/colbert_models.py
+++ b/mteb/models/colbert_models.py
@@ -100,10 +100,13 @@ def encode(
             )
 
         logger.info(f"Encoding {len(sentences)} sentences.")
 
+        if "request_qid" in kwargs:
+            kwargs.pop("request_qid")
         pred = self.model.encode(
             sentences,
prompt_name=prompt_name, is_query=True if prompt_type == PromptType.query else False, + convert_to_tensor=True, **kwargs, ) @@ -152,18 +155,22 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: languages=["eng_Latn"], open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", - public_training_code=True, + public_training_code=None, + public_training_data=None, release_date="2024-09-21", n_parameters=110 * 1e6, max_tokens=180, # Reduced for Benchmarking - see ColBERT paper embed_dim=None, # Bag of Embeddings (128) for each token license="mit", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/colbert-ir/colbertv2.0", use_instructions=False, adapted_from=None, superseded_by=None, + training_datasets={ + "MSMARCO": ["train"], # dev? + }, ) @@ -203,16 +210,22 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: ], open_weights=True, revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", - public_training_code=False, + public_training_code=None, + public_training_data=None, release_date="2024-08-16", n_parameters=559 * 1e6, max_tokens=8192, embed_dim=None, # Bag of Embeddings (128) for each token license="cc-by-nc-4.0", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/jinaai/jina-colbert-v2", use_instructions=False, adapted_from=None, superseded_by=None, + training_datasets={ + "MSMARCO": ["train"], + "DuRetrieval": [], + "MIRACL": ["train"], + }, ) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index cbdc7c7e9d..58afc17976 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -6,7 +6,7 @@ from mteb.model_meta import ModelMeta -from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES +from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES from .instruct_wrapper import instruct_wrapper 
MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] @@ -15,6 +15,16 @@ E5_INSTRUCTION = "Instruct: {instruction}\nQuery: " +E5_MISTRAL_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_instruct = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -45,6 +55,9 @@ journal={arXiv preprint arXiv:2402.05672}, year={2024} }""", + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, ) e5_mistral = ModelMeta( @@ -88,4 +101,82 @@ year={2022} } """, + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, +) + +zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + instruction_template=E5_INSTRUCTION, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype=torch.bfloat16, + # The ST script does not normalize while the HF one does so unclear what to do + # https://huggingface.co/intfloat/e5-mistral-7b-instruct#transformers + normalized=True, + ), + name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3", + release_date="2024-08-30", + languages=["eng_Latn"], + n_parameters=7110660096, + max_tokens=32768.0, + embed_dim=4096, + license="mit", + open_weights=True, + public_training_data=None, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets={ + # copied from e5 + # source: https://arxiv.org/pdf/2212.03533 + "NQ": ["test"], + "NQHardNegatives": ["test"], + "MSMARCO": ["train"], # dev? 
+ # source: https://www.zeta-alpha.com/post/fine-tuning-an-llm-for-state-of-the-art-retrieval-zeta-alpha-s-top-10-submission-to-the-the-mteb-be + # "Arguana", + # "FEVER", + # "FIQA", + # "HotPotQA", + # "MsMarco (passage)", + # "NFCorpus", + # "SciFact", + # "NLI", + # "SQuad", + # "StackExchange", + # "TriviaQA", + # "SciRep", + # "SciRepEval" + # mteb + # https://huggingface.co/datasets/mteb/raw_arxiv + # "ArxivClusteringS2S": ["train"], + # "ArxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_biorxiv + # "BiorxivClusteringS2S": ["train"], + # "BiorxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_medrxiv + # "MedrxivClusteringS2S": ["train"], + # "MedrxivClusteringP2P": ["train"], + # as their train datasets + "AmazonCounterfactualClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ImdbClassification": ["train"], + "STS12": ["train"], + "STS22": ["train"], + "STSBenchmark": ["train"], + }, + adapted_from="intfloat/e5-mistral-7b-instruct", + superseded_by=None, ) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index b3429eef61..4c3c3d4790 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -131,6 +131,29 @@ PromptType.passage.value: "passage: ", } +E5_TRAINING_DATA = { + # from 4.2 in https://arxiv.org/pdf/2212.03533 + # also pre-training data from a variety of sources (stackexchange, semantic scholar, reddit, CC, ...) 
+ "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on +} + +ME5_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -152,26 +175,9 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, citation=MULTILINGUAL_E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_code=None, # couldn't find + training_datasets=ME5_TRAINING_DATA, + public_training_data=None, ) e5_mult_base = ModelMeta( @@ -193,27 +199,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, citation=MULTILINGUAL_E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2402.05672 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. 
SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, ) e5_mult_large = ModelMeta( @@ -236,27 +225,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, citation=MULTILINGUAL_E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2402.05672 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, ) e5_eng_small_v2 = ModelMeta( @@ -278,15 +250,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, ) e5_eng_small = ModelMeta( @@ -309,15 +276,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? 
- }, ) e5_eng_base_v2 = ModelMeta( @@ -343,14 +305,9 @@ superseded_by=None, adapted_from=None, citation=E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, ) e5_eng_large_v2 = ModelMeta( @@ -375,15 +332,10 @@ use_instructions=True, superseded_by=None, adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, ) e5_large = ModelMeta( @@ -408,15 +360,10 @@ use_instructions=True, superseded_by="intfloat/e5-large-v2", adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, ) e5_base = ModelMeta( @@ -441,13 +388,8 @@ use_instructions=True, superseded_by="intfloat/e5-base-v2", adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? 
- },
 )
diff --git a/mteb/models/gme_models.py b/mteb/models/gme_models.py
new file mode 100644
index 0000000000..804dfbc84d
--- /dev/null
+++ b/mteb/models/gme_models.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+import logging
+
+from mteb.model_meta import ModelMeta
+
+logger = logging.getLogger(__name__)
+
+
+gme_qwen2_vl_2b_instruct = ModelMeta(
+    loader=None,
+    name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
+    languages=["eng_Latn"],
+    open_weights=True,
+    revision="cfeb66885b598de483cc04eb08c7d9da534d7afe",
+    release_date="2024-12-21",
+    n_parameters=int(2.21 * 1e9),
+    max_tokens=32768,
+    embed_dim=1536,
+    license="mit",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
+    use_instructions=True,
+    adapted_from=None,
+    superseded_by=None,
+    training_datasets={
+        # Only annotating text data for now
+        # source: https://arxiv.org/pdf/2412.16855
+        "MSMARCO": ["train"],
+        "MSMARCO.v2": ["train"],
+    },
+    public_training_code=None,
+    public_training_data=None,
+)
+
+gme_qwen2_vl_7b_instruct = ModelMeta(
+    loader=None,
+    name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
+    languages=["eng_Latn"],
+    open_weights=True,
+    revision="d42eca5a540526cfa982a349724b24b25c12a95e",
+    release_date="2024-12-21",
+    n_parameters=int(8.29 * 1e9),
+    max_tokens=32768,
+    embed_dim=3584,
+    license="mit",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
+    use_instructions=True,
+    adapted_from=None,
+    superseded_by=None,
+    training_datasets={
+        # Only annotating text data for now
+        # source: https://arxiv.org/pdf/2412.16855
+        "MSMARCO": ["train"],
+        "MSMARCO.v2": ["train"],
+    },
+    public_training_code=None,
+    public_training_data=None,
+)
diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py
index d733e80b24..40d316fee7 100644
--- a/mteb/models/google_models.py
+++ b/mteb/models/google_models.py
@@ -151,8 
+151,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -174,8 +174,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -197,7 +197,7 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index a7be0889d0..eb23ee66bf 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -5,11 +5,24 @@ from mteb.model_meta import ModelMeta +from .e5_models import E5_TRAINING_DATA from .instruct_wrapper import instruct_wrapper logger = logging.getLogger(__name__) +GRIT_LM_TRAINING_DATA = { + **E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # also uses medi2 which contains fever and hotpotqa: + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + + def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: return ( "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n" @@ -41,7 +54,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", - training_datasets={"GritLM/tulu2": ["train"]}, 
n_parameters=7_240_000_000, embed_dim=4096, license="apache-2.0", @@ -50,6 +62,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=GRIT_LM_TRAINING_DATA, + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tรผlu 2 data + public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, citation=GRITLM_CITATION, ) gritlm8x7b = ModelMeta( @@ -62,7 +78,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: ), name="GritLM/GritLM-8x7B", languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], - training_datasets={"GritLM/tulu2": ["train"]}, open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", @@ -74,5 +89,9 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=GRIT_LM_TRAINING_DATA, citation=GRITLM_CITATION, + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tรผlu 2 data + public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, ) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index 456774a018..fb3bb6db3e 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -5,7 +5,7 @@ import torch from mteb.encoder_interface import PromptType -from mteb.model_meta import ModelMeta +from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.instruct_wrapper import instruct_wrapper @@ -55,6 +55,10 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, citation=GTE_CITATION, + public_training_code=None, + public_training_data=None, + 
training_datasets=None, + max_tokens=131072, ) @@ -83,6 +87,9 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=None, ) @@ -111,4 +118,196 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=None, +) + +gte_small_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-small-zh", + revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", + ), + name="thenlper/gte-small-zh", + languages=["zho_Hans"], + open_weights=True, + revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", + release_date="2023-11-08", # initial commit of hf model. + n_parameters=30.3 * 1e6, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-small-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, # Not disclosed +) + +gte_base_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-base-zh", + revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", + ), + name="thenlper/gte-base-zh", + languages=["zho_Hans"], + open_weights=True, + revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", + release_date="2023-11-08", # initial commit of hf model. 
+ n_parameters=102 * 1e6, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-base-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, # Not disclosed +) + +gte_large_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-large-zh", + revision="64c364e579de308104a9b2c170ca009502f4f545", + ), + name="thenlper/gte-large-zh", + languages=["zho_Hans"], + open_weights=True, + revision="64c364e579de308104a9b2c170ca009502f4f545", + release_date="2023-11-08", # initial commit of hf model. + n_parameters=326 * 1e6, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-large-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, # Not disclosed +) + +gte_multilingual_langs = [ + "afr_Latn", + "ara_Arab", + "aze_Latn", + "bel_Cyrl", + "bul_Cyrl", + "ben_Beng", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "ell_Grek", + "eng_Latn", + "spa_Latn", + "est_Latn", + "eus_Latn", + "fas_Arab", + "fin_Latn", + "fra_Latn", + "glg_Latn", + "guj_Gujr", + "heb_Hebr", + "hin_Deva", + "hrv_Latn", + "hat_Latn", + "hun_Latn", + "hye_Armn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jpn_Jpan", + "jav_Latn", + "kat_Geor", + "kaz_Cyrl", + "khm_Khmr", + "kan_Knda", + "kor_Hang", + "kir_Cyrl", + "lao_Laoo", + "lit_Latn", + "lav_Latn", + "mkd_Cyrl", + "mal_Mlym", + "mon_Cyrl", + "mar_Deva", + "msa_Latn", + "mya_Mymr", + "nep_Deva", + "nld_Latn", + "nor_Latn", + "pan_Guru", + "pol_Latn", + "por_Latn", + "que_Latn", + "ron_Latn", + "rus_Cyrl", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "swa_Latn", + "tam_Taml", + "tel_Telu", + 
"tha_Thai", + "tgl_Latn", + "tur_Latn", + "ukr_Cyrl", + "urd_Arab", + "vie_Latn", + "yor_Latn", + "zho_Hans", +] +# Source: https://arxiv.org/pdf/2407.19669 +gte_multi_training_data = { + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "NQ": ["train"], + "MSMARCO": ["train"], + "HotpotQA": ["train"], + "FEVER": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "MultiLongDocRetrieval": ["train"], + # not in MTEB: + # - TriviaQA + # - SQuAD + # - AllNLI + # - Multi-CPR +} + +gte_multilingual_base = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="Alibaba-NLP/gte-multilingual-base", + revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", + ), + name="Alibaba-NLP/gte-multilingual-base", + languages=gte_multilingual_langs, + open_weights=True, + revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", + release_date="2024-07-20", # initial commit of hf model. 
+ n_parameters=305 * 1e6, + embed_dim=1024, + license="apache-2", + max_tokens=8192, + reference="https://huggingface.co/Alibaba-NLP/gte-multilingual-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, # couldn't find + training_datasets=gte_multi_training_data, ) diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index 394ed2d5f5..e7c3b8b022 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -20,6 +20,65 @@ "zho_Hans", ] +granite_training_data = { + # Multilingual MC4 + # Multilingual Webhose + # English Wikipedia + # Multilingual Wikimedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # Miracl Corpus (Title-Body) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (bodies) + "StackOverflowDupQuestions": [], + "AskUbuntuDupQuestions": [], + # Stack Exchange (Title, Answer) pairs + # Stack Exchange (Title, Body) pairs + # Stack Exchange (Title, Body) pairs + # Machine Translations of Stack Exchange Duplicate questions (titles) + # Machine Translations of Stack Exchange (Title+Body, Answer) pairs + "StackExchangeClusteringP2P": [], + "StackExchangeClusteringP2P.v2": [], + "StackExchangeClustering": [], + "StackExchangeClustering.v2": [], + # SearchQA + # S2ORC (Title, Abstract) + # WikiAnswers Duplicate question pairs + # CCNews + # XSum + # SimpleWiki + # Machine Translated Cross Lingual Parallel Corpora + # SPECTER citation triplets + # Machine Translations of SPECTER citation triplets + # Natural Questions (NQ) + "NQ": ["test"], + "NQHardNegatives": ["test"], + # SQuAD2.0 + # HotpotQA + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # Fever + "FEVER": ["test"], + 
"FEVERHardNegatives": ["test"], + # PubMed + # Multilingual Miracl Triples + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + # Multilingual MrTydi Triples + "MrTidyRetrieval": ["train"], + # Sadeeem Question Asnwering + # DBPedia Title-Body Pairs + "DBPedia": ["train"], + # Synthetic: English Query-Wikipedia Passage + # Synthetic: English Fact Verification + # Synthetic: Multilingual Query-Wikipedia Passage + # Synthetic: Multilingual News Summaries + # IBM Internal Triples + # IBM Internal Title-Body Pairs +} granite_107m_multilingual = ModelMeta( loader=partial( # type: ignore @@ -41,6 +100,10 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + public_training_data=None, + use_instructions=False, + training_datasets=granite_training_data, ) granite_278m_multilingual = ModelMeta( @@ -63,6 +126,10 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + public_training_data=None, + use_instructions=False, + training_datasets=granite_training_data, ) granite_30m_english = ModelMeta( @@ -85,6 +152,10 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + public_training_data=None, + use_instructions=False, + training_datasets=granite_training_data, ) granite_125m_english = ModelMeta( @@ -107,4 +178,8 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + public_training_data=None, + use_instructions=False, + training_datasets=granite_training_data, ) diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py new file mode 100644 index 0000000000..0d40ff3ef2 --- /dev/null +++ b/mteb/models/inf_models.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from functools import partial + +from mteb.model_meta import ModelMeta, 
sentence_transformers_loader + +inf_retriever_v1 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="infly/inf-retriever-v1", + revision="d2d074546028c0012b5cc6af78c4fac24896e67f", + trust_remote_code=True, + ), + name="infly/inf-retriever-v1", + languages=["eng_Latn", "zho_Hans"], + open_weights=True, + revision="d2d074546028c0012b5cc6af78c4fac24896e67f", + release_date="2024-12-24", # initial commit of hf model. + n_parameters=7_069_121_024, + embed_dim=3584, + license="apache-2.0", + max_tokens=131_072, + reference="https://huggingface.co/infly/inf-retriever-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", + public_training_code=None, + public_training_data=None, + training_datasets=None, +) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 970b487ea9..dbd1615ad8 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -13,6 +13,7 @@ from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta +from .nvidia_models import nvidia_training_datasets from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -89,7 +90,8 @@ def encode( use_instructions=True, adapted_from=None, superseded_by=None, - training_datasets={ - "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], - }, + training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 + # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 192ad4cc5c..41742a2ee3 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -214,7 +214,7 @@ def encode( open_weights=True, 
revision="215a6e121fa0183376388ac6b1ae230326bfeaed", release_date="2024-09-18", # official release date - n_parameters=572 * 1e6, + n_parameters=int(572 * 1e6), max_tokens=8194, embed_dim=4096, license="cc-by-nc-4.0", @@ -222,6 +222,25 @@ def encode( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", + public_training_code=None, + public_training_data=None, + training_datasets={ + # CulturaX + "STS12": [], + # "SICK": [], + # "WMT19": [], + # "MADLAD-3B": [], + # NLI + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # oasst1, oasst2 + }, + adapted_from="XLM-RoBERTa", citation=""" @misc{sturua2024jinaembeddingsv3multilingualembeddingstask, title={jina-embeddings-v3: Multilingual Embeddings With Task LoRA}, @@ -234,3 +253,88 @@ def encode( } """, ) + + +jina_embeddings_v2_base_en = ModelMeta( + name="jinaai/jina-embeddings-v2-base-en", + languages=["eng-Latn"], + open_weights=True, + revision="6e85f575bc273f1fd840a658067d0157933c83f0", + release_date="2023-09-27", + n_parameters=137_000_000, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=None, + public_training_data=None, +) + +jina_embeddings_v2_small_en = ModelMeta( + name="jinaai/jina-embeddings-v2-small-en", + languages=["eng-Latn"], + open_weights=True, + revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", + release_date="2023-09-27", + n_parameters=32_700_000, + embed_dim=512, + license="apache-2.0", + max_tokens=8192, + 
reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=None, + public_training_data=None, +) + +jina_embedding_b_en_v1 = ModelMeta( + name="jinaai/jina-embedding-b-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="aa0645035294a8c0607ce5bb700aba982cdff32c", + release_date="2023-07-07", + n_parameters=110_000_000, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-base-en", + adapted_from=None, + training_datasets=None, + public_training_code=None, + public_training_data=None, +) + +jina_embedding_s_en_v1 = ModelMeta( + name="jinaai/jina-embedding-s-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", + release_date="2023-07-07", + n_parameters=35_000_000, + embed_dim=512, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-small-en", + adapted_from=None, + training_datasets=None, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py new file mode 100644 index 0000000000..380724e53e --- /dev/null +++ b/mteb/models/lens_models.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +from .bge_models import bge_full_data + +lens_d4000 = ModelMeta( + loader=None, # TODO: implement this in the future + name="yibinlei/LENS-d4000", + 
languages=None, + open_weights=True, + revision="e473b33364e6c48a324796fd1411d3b93670c6fe", + release_date="2025-01-17", + n_parameters=int(7.11 * 1e9), + embed_dim=4000, + license="apache-2.0", + reference="https://huggingface.co/yibinlei/LENS-d4000", + similarity_fn_name="cosine", + framework=["PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, + max_tokens=32768, +) + +lens_d8000 = ModelMeta( + loader=None, # TODO: implement this in the future + name="yibinlei/LENS-d8000", + languages=None, + open_weights=True, + revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef", + release_date="2025-01-17", + n_parameters=int(7.11 * 1e9), + embed_dim=8000, + license="apache-2.0", + reference="https://huggingface.co/yibinlei/LENS-d8000", + similarity_fn_name="cosine", + framework=["PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, + max_tokens=32768, +) diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py index e67ec7dec5..ead10ebf71 100644 --- a/mteb/models/linq_models.py +++ b/mteb/models/linq_models.py @@ -39,4 +39,7 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=None, ) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index cf85c591c8..82186309db 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -20,6 +20,31 @@ def llm2vec_instruction(instruction): return instruction +llm2vec_supervised_training_data = { + # source, section g1: https://arxiv.org/pdf/2404.05961 + # splits assumed but unkown + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + 
"HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], +} + + class LLM2VecWrapper(Wrapper): def __init__( self, @@ -111,6 +136,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets=llm2vec_supervised_training_data, + public_training_data=None, citation=LLM2VEC_CITATION, ) @@ -136,6 +164,9 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["LLM2Vec", "PyTorch"], use_instructions=True, citation=LLM2VEC_CITATION, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets={}, + public_training_data=None, ) @@ -161,6 +192,9 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["LLM2Vec", "PyTorch"], use_instructions=True, citation=LLM2VEC_CITATION, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -185,6 +219,9 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["LLM2Vec", "PyTorch"], use_instructions=True, citation=LLM2VEC_CITATION, + 
public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets={}, + public_training_data=None, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -209,6 +246,9 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["LLM2Vec", "PyTorch"], use_instructions=True, citation=LLM2VEC_CITATION, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -232,6 +272,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets={}, + public_training_data=None, citation=LLM2VEC_CITATION, ) @@ -257,6 +300,9 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["LLM2Vec", "PyTorch"], use_instructions=True, citation=LLM2VEC_CITATION, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -281,4 +327,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["LLM2Vec", "PyTorch"], use_instructions=True, citation=LLM2VEC_CITATION, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", + training_datasets={}, + public_training_data=None, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 488a5c8f06..140d8bac74 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -5,6 +5,10 @@ import torch from mteb.model_meta import ModelMeta, sentence_transformers_loader +from 
mteb.models.e5_models import E5_TRAINING_DATA + +from .bge_models import bge_m3_training_data, bge_training_data +from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( name="Haon-Chen/speed-embedding-7b-instruct", @@ -17,8 +21,8 @@ embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", similarity_fn_name="cosine", @@ -38,8 +42,8 @@ embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", similarity_fn_name="cosine", @@ -59,8 +63,8 @@ embed_dim=896, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", similarity_fn_name="cosine", @@ -80,8 +84,8 @@ embed_dim=896, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", similarity_fn_name="cosine", @@ -101,45 +105,54 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", similarity_fn_name="cosine", use_instructions=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], - "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ - "train" - ], - 
"flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ - "train" - ], - "sentence-transformers/reddit-title-body": ["train"], - "msmarco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], - "sentence-transformers/embedding-training-data": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], + # "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ + # "train" + # ], + # "sentence-transformers/reddit-title-body": ["train"], + # "msmarco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], 
+ # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], + # "sentence-transformers/embedding-training-data": ["train"], }, adapted_from="hum-lodestone-v1", superseded_by=None, @@ -155,8 +168,8 @@ embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Jaume/gemma-2b-embeddings", similarity_fn_name="cosine", @@ -176,13 +189,14 @@ embed_dim=None, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"BeastyZ/E5-R": ["train"]}, + training_datasets=E5_TRAINING_DATA, + # not MTEB: {"BeastyZ/E5-R": ["train"]}, adapted_from="/ConRetriever/public_weight_mistral", superseded_by=None, ) @@ -202,8 +216,8 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", similarity_fn_name="cosine", @@ -228,8 +242,8 @@ embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", similarity_fn_name="cosine", @@ -254,8 +268,8 @@ embed_dim=384, license="apache-2.0", 
open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", similarity_fn_name="cosine", @@ -275,13 +289,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/dwsdwass", superseded_by=None, ) @@ -296,13 +311,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/jhjghjgh", superseded_by=None, ) @@ -317,13 +333,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # source model is bge-base-en-v1.5 + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test21", superseded_by=None, ) @@ -338,13 +355,14 @@ embed_dim=384, license="mit", open_weights=True, - 
public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is unkown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test14", superseded_by=None, ) @@ -359,13 +377,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # distill from bge-base-en-v1.5 + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test22", superseded_by=None, ) @@ -380,8 +399,8 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro", similarity_fn_name="cosine", @@ -401,8 +420,8 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro-v4", similarity_fn_name="cosine", @@ -422,8 +441,8 @@ embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", similarity_fn_name="cosine", @@ -443,13 +462,13 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, 
framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="aubmindlab/bert-base-arabertv02", superseded_by=None, ) @@ -464,13 +483,15 @@ embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) @@ -485,13 +506,15 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, # derived from + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", superseded_by=None, ) @@ -506,13 +529,15 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + 
public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=None, # derived from labSE + # as well as: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/LaBSE", superseded_by=None, ) @@ -527,13 +552,15 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="tomaarsen/mpnet-base-all-nli-triplet", superseded_by=None, ) @@ -548,13 +575,13 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="UBC-NLP/MARBERTv2", superseded_by=None, ) @@ -569,8 +596,8 @@ embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], 
reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", similarity_fn_name="cosine", @@ -590,8 +617,8 @@ embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", similarity_fn_name="cosine", @@ -611,8 +638,8 @@ embed_dim=1024, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/bge-m3-custom-fr", similarity_fn_name="cosine", @@ -632,8 +659,8 @@ embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", similarity_fn_name="cosine", @@ -653,8 +680,8 @@ embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", similarity_fn_name="cosine", @@ -674,13 +701,14 @@ embed_dim=2048, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"manu/embedding_data_v2_100k": ["train"]}, + training_datasets=None, + # Not in MTEB: {"manu/embedding_data_v2_100k": ["train"]}, adapted_from="croissantllm/CroissantCool-v0.2", superseded_by=None, ) @@ -695,8 +723,8 @@ embed_dim=768, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, 
framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-base", similarity_fn_name="cosine", @@ -716,8 +744,8 @@ embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-large", similarity_fn_name="cosine", @@ -737,8 +765,8 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-small", similarity_fn_name="cosine", @@ -758,8 +786,8 @@ embed_dim=768, license="gpl-3.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", similarity_fn_name="cosine", @@ -779,8 +807,8 @@ embed_dim=768, license="lgpl", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1", similarity_fn_name="cosine", @@ -800,8 +828,8 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-base", similarity_fn_name="cosine", @@ -821,8 +849,8 @@ embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/dwzhu/e5-base-4k", similarity_fn_name="cosine", @@ -842,8 +870,8 @@ embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-large", similarity_fn_name="cosine", 
@@ -863,8 +891,8 @@ embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-small", similarity_fn_name="cosine", @@ -884,8 +912,8 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-base", similarity_fn_name="cosine", @@ -905,8 +933,8 @@ embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-large", similarity_fn_name="cosine", @@ -972,8 +1000,8 @@ embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-1b1", similarity_fn_name="cosine", @@ -1039,8 +1067,8 @@ embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-3b", similarity_fn_name="cosine", @@ -1106,8 +1134,8 @@ embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-560m", similarity_fn_name="cosine", @@ -1173,8 +1201,8 @@ embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-7b1", similarity_fn_name="cosine", @@ -1194,8 
+1222,8 @@ embed_dim=768, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0", similarity_fn_name="cosine", @@ -1215,8 +1243,8 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2", similarity_fn_name="cosine", @@ -1236,8 +1264,8 @@ embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0", similarity_fn_name="cosine", @@ -1257,8 +1285,8 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0", similarity_fn_name="cosine", @@ -1278,8 +1306,8 @@ embed_dim=4096, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco", similarity_fn_name="cosine", @@ -1299,13 +1327,14 @@ embed_dim=1024, license=None, open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"stsb_multi_mt": ["train"]}, + training_datasets=None, # couldn't figure out the source model + # {"stsb_multi_mt": ["train"]}, 
adapted_from="/content/drive/MyDrive/Stanford_NLU/Project/false_friends/gbert_large_sts_only", superseded_by=None, ) @@ -1320,8 +1349,8 @@ embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1", similarity_fn_name="cosine", @@ -1347,8 +1376,8 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0", similarity_fn_name="cosine", @@ -1368,8 +1397,8 @@ embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/brahmairesearch/slx-v0.1", similarity_fn_name="cosine", @@ -1389,8 +1418,8 @@ embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/deepfile/embedder-100p", similarity_fn_name="cosine", @@ -1410,24 +1439,24 @@ embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], - }, + 
training_datasets=bge_m3_training_data, # derived from. + # not in MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], adapted_from="USER-bge-m3", superseded_by=None, ) @@ -1442,8 +1471,8 @@ embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/infgrad/stella-base-en-v2", similarity_fn_name="cosine", @@ -1463,8 +1492,8 @@ embed_dim=1024, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/malenia1/ternary-weight-embedding", similarity_fn_name="cosine", @@ -1484,8 +1513,8 @@ embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka", similarity_fn_name="cosine", @@ -1515,8 +1544,8 @@ embed_dim=2304, license=None, open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/openbmb/MiniCPM-Embedding", similarity_fn_name="cosine", @@ -1546,13 +1575,14 @@ embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", 
use_instructions=None, - training_datasets={"shibing624/nli-zh-all": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not MTEB: {"shibing624/nli-zh-all": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) @@ -1567,8 +1597,8 @@ embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1", similarity_fn_name="cosine", @@ -1577,24 +1607,171 @@ adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250", superseded_by=None, ) -zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( - name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", - revision="3e6076bdc2ff592a2f95fbc04570e51db5aa0c0c", - release_date="2024-08-30", - languages=["eng_Latn"], + +sbert_chinese_general_v1 = ModelMeta( + name="DMetaSoul/sbert-chinese-general-v1", + revision="bd27765956bcc2fcf682de0097819947ac10037e", + release_date="2022-03-25", + languages=["zho_Hans"], loader=None, - n_parameters=7110660096, - max_tokens=32768.0, - embed_dim=4096, + n_parameters=None, # Not visible on repo + max_tokens=512, + embed_dim=128, + license="apache-2", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={ + "PAWSX": ["train"], + "PawsXPairClassification": ["train"], # they do not specify which one + # They might have trained on other datasets too, they don't say: + # "trained on semantically similar datasets such as NLI, PAWS-X, PKU-Paraphrase-Bank, and STS." 
+ }, + superseded_by=None, +) +dmeta_embedding_zh_small = ModelMeta( + name="DMetaSoul/Dmeta-embedding-zh-small", + revision="2050d3439a2f68999dd648c1697471acaac37a29", + release_date="2024-03-25", + languages=["zho_Hans"], + loader=None, + n_parameters=74.2 * 1e6, + max_tokens=1024, + embed_dim=768, + license="apache-2", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # They don't specify + superseded_by=None, +) +xiaobu_embedding = ModelMeta( + name="lier007/xiaobu-embedding", + revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", + release_date="2024-01-09", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + max_tokens=512, + embed_dim=1024, + license="not specified", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/lier007/xiaobu-embedding", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Finetuned from GTE, none of them disclose training data + superseded_by=None, + adapted_from="thenlper/gte-large-zh", +) +xiaobu_embedding_v2 = ModelMeta( + name="lier007/xiaobu-embedding-v2", + revision="1912f2e59a5c2ef802a471d735a38702a5c9485e", + release_date="2024-06-30", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + max_tokens=512, + embed_dim=768, + license="not specified", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/lier007/xiaobu-embedding-v2", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Finetuned from piccolo-embedding, none of them say + superseded_by=None, + 
adapted_from="sensenova/piccolo-base-zh", +) +yinka_embedding = ModelMeta( + name="Classical/Yinka", + revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", + release_date="2024-01-09", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + max_tokens=512, + embed_dim=1024, + license="not specified", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Classical/Yinka", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Not disclosed + superseded_by=None, + adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", +) +conan_embedding = ModelMeta( + name="TencentBAC/Conan-embedding-v1", + revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb", + release_date="2024-08-22", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + max_tokens=512, + embed_dim=768, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Classical/Yinka", + similarity_fn_name="cosine", + use_instructions=None, + # source: https://arxiv.org/pdf/2408.15710 + training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage + superseded_by=None, +) +ember_v1 = ModelMeta( + name="llmrails/ember-v1", + revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d", + release_date="2023-10-10", + languages=["eng_Latn"], + n_parameters=335 * 1e6, + max_tokens=512, + embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, - framework=["PyTorch"], - reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/llmrails/ember-v1", similarity_fn_name="cosine", use_instructions=None, training_datasets=None, - 
adapted_from="intfloat/e5-mistral-7b-instruct", + superseded_by=None, +) +amazon_titan_text_embeddings_v2 = ModelMeta( + name="amazon/Titan-text-embeddings-v2", + revision="1", + release_date="2024-04-30", + languages=["eng_Latn"], + n_parameters=None, + max_tokens=None, + embed_dim=None, + license="proprietary", + open_weights=False, + public_training_code=None, + public_training_data=None, + framework=[], + reference="https://huggingface.co/amazon/Titan-text-embeddings-v2", + similarity_fn_name="cosine", + use_instructions=False, + training_datasets=None, superseded_by=None, ) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 37da533457..33da211c7a 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -9,6 +9,7 @@ from mteb.model_meta import ModelMeta +from .bge_models import bge_training_data from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -72,21 +73,10 @@ def encode( reference="https://huggingface.co/minishlab/M2V_base_glove_subword", use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, superseded_by=None, + training_datasets=bge_training_data, # distilled + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) @@ -110,20 +100,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + 
training_datasets=bge_training_data, # distilled + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_base_output = ModelMeta( @@ -146,20 +125,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_multilingual_output = ModelMeta( @@ -182,8 +150,9 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model + training_datasets=None, + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_2m = ModelMeta( @@ -206,20 +175,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_4m = ModelMeta( @@ -242,20 +200,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - 
"AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_8m = ModelMeta( @@ -278,18 +225,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py new file mode 100644 index 0000000000..1504b40789 --- /dev/null +++ b/mteb/models/moka_models.py @@ -0,0 +1,147 @@ +"""Moka AI's Chinese embedding models""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +sent_trf_training_dataset = { + # derived from datasheets + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # 
"embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], +} +medi_dataset = { + **sent_trf_training_dataset, + # not in MTEB: + # - Super-NI + # - KILT (https://arxiv.org/abs/2009.02252) + # - MedMCQA (https://proceedings.mlr.press/v174/pal22a/pal22a.pdf) +} +m3e_dataset = { + **medi_dataset, + "AmazonReviewsClassification": ["train"], # Possibly also test, hard to know + "Ocnli": ["train"], + "BQ": ["train"], + "LCQMC": ["train"], + "MIRACLReranking": ["train"], + "PAWSX": ["train"], + # not in MTEB: + # - cmrc2018 + # - belle_2m + # - firefily + # - alpaca_gpt4 + # - zhihu_kol + # - hc3_chinese + # - amazon_reviews_multi (intersects with AmazonReviewsClassification) + # - qa: Encyclopedia QA dataset + # - xlsum + # - wiki_atomic_edit + # - chatmed_consult + # - webqa + # - dureader_robust + # - csl + # - lawzhidao + # - CINLID + # - DuSQL + # - Zhuiyi-NL2SQL + # - Cspider + # - news2016zh + # - baike2018qa + # - webtext2019zh + # - SimCLUE + # - SQuAD +} + +m3e_base = ModelMeta( + name="moka-ai/m3e-base", + languages=["zho_Hans", "eng-Latn"], + open_weights=True, + revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c", + release_date="2023-06-06", # first commit + n_parameters=102 * 1e6, + embed_dim=768, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, # Not published + training_datasets=m3e_dataset, +) + +m3e_small = ModelMeta( + name="moka-ai/m3e-small", + languages=["zho_Hans", 
"eng-Latn"], + open_weights=True, + revision="44c696631b2a8c200220aaaad5f987f096e986df", + release_date="2023-06-02", # first commit + n_parameters=None, # Can't be seen on HF page + embed_dim=512, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-small", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, # Not published + training_datasets=m3e_dataset, +) + + +m3e_large = ModelMeta( + name="moka-ai/m3e-large", + languages=["zho_Hans", "eng-Latn"], + open_weights=True, + revision="12900375086c37ba5d83d1e417b21dc7d1d1f388", + release_date="2023-06-21", # first commit + n_parameters=None, # Can't be seen on HF page + embed_dim=768, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-large", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, # Not published + training_datasets=m3e_dataset, +) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index f9c8a013f5..c4bc7c3db8 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -41,4 +41,7 @@ year={2023} } """, + public_training_code=None, + public_training_data=None, + training_datasets=None, ) diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py index 4924e316f9..9ff5cf901f 100644 --- a/mteb/models/no_instruct_sentence_models.py +++ b/mteb/models/no_instruct_sentence_models.py @@ -99,4 +99,7 @@ def encode( # type: ignore use_instructions=False, adapted_from=None, 
superseded_by=None, + public_training_code=None, + public_training_data=None, + training_datasets=None, ) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index f8c9cf0c7e..c2d06e2f6e 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -90,6 +90,79 @@ def encode( # type: ignore return emb +nomic_training_data = { + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/contrastive_pretrain.yaml + # reddit_title_body + "RedditClustering": [], + "RedditClusteringP2P": [], + "RedditClustering.v2": [], + "RedditClusteringP2P.v2": [], + # amazon_reviews + # amazonqa + "AmazonPolarityClassification": [], + "AmazonReviewsClassification": [], + "AmazonCounterfactualClassification": [], + # paq + # s2orc_citation_titles + # s2orc_title_abstract + # s2orc_abstract_citation + # s2orc_abstract_body + # wikianswers + # wikipedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # gooaq + # codesearch + "CodeSearchNetCCRetrieval": [], + "COIRCodeSearchNetRetrieval": [], + # yahoo_title_answer + # yahoo_qa + # yahoo_title_question + "YahooAnswersTopicsClassification": [], + # agnews + # ccnews + # npr + # eli5 + # cnn + # stackexchange_duplicate_questions + # stackexchange_title_body + # stackexchange_body_body + "StackExchangeClustering.v2": [], + "StackExchangeClusteringP2P.v2": [], + # sentence_compression + # wikihow + # altlex + # quora + "QuoraRetrieval": [], + "NanoQuoraRetrieval": [], + # simplewiki + # squad + "FQuADRetrieval": [], + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/finetune_triplets.yaml + # msmaro + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + # nq_triples + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # 
nli_triplets + # reddit + # medi_wiki + # medi_stackexchange + # medi_flickr + # medi_supernli + # hotpot + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], +} + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L142-L159 model_prompts = { "Classification": "classification: ", @@ -138,6 +211,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_data=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, ) nomic_embed_v1 = ModelMeta( @@ -164,6 +240,9 @@ def encode( # type: ignore citation=NOMIC_CITATION, adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) nomic_embed_v1_ablated = ModelMeta( @@ -189,6 +268,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) @@ -215,6 +297,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) 
nomic_modern_bert_embed = ModelMeta( @@ -240,6 +325,10 @@ def encode( # type: ignore similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - adapted_from=None, + adapted_from="answerdotai/ModernBERT-base", + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_pretrain_modernbert.yaml", + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune_modernnomic.yaml superseded_by=None, + training_datasets=nomic_training_data, + public_training_data=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 08b1072cc2..1997a85274 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -72,6 +72,54 @@ def encode( return embeddings +nvidia_training_datasets = { + # source: https://arxiv.org/pdf/2405.17428 + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + 
"TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on +} NV_embed_v2 = ModelMeta( loader=partial( # type: ignore NvEmbedWrapper, @@ -91,6 +139,9 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=nvidia_training_datasets, + public_training_code=None, + public_training_data=None, ) NV_embed_v1 = ModelMeta( @@ -112,4 +163,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=nvidia_training_datasets, + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 9d12c388c0..079e7c9361 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -135,8 +135,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: similarity_fn_name="cosine", framework=["API"], use_instructions=False, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) text_embedding_3_large = ModelMeta( @@ -157,9 +157,11 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_data=False, # assumed - 
public_training_code=False, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, + license=None, + similarity_fn_name=None, ) text_embedding_ada_002 = ModelMeta( name="openai/text-embedding-ada-002", @@ -179,7 +181,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, + license=None, + similarity_fn_name=None, ) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 4e19bed19c..e23285ff68 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -6,7 +6,7 @@ from typing import Any from huggingface_hub import ModelCard -from sentence_transformers import SentenceTransformer +from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.abstasks.AbsTask import AbsTask from mteb.encoder_interface import Encoder @@ -15,25 +15,31 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, e5_models, + gme_models, google_models, gritlm_models, gte_models, ibm_granite_models, + inf_models, jasper_models, jina_models, + lens_models, linq_models, llm2vec_models, misc_models, model2vec_models, + moka_models, mxbai_models, no_instruct_sentence_models, nomic_models, nvidia_models, openai_models, + piccolo_models, promptriever_models, repllama_models, rerankers_custom, @@ -42,6 +48,7 @@ salesforce_models, sentence_transformers_models, stella_models, + text2vec_models, uae_models, voyage_models, ) @@ -52,6 +59,7 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, @@ -60,17 +68,22 @@ google_models, gritlm_models, gte_models, + gme_models, ibm_granite_models, + inf_models, jina_models, + lens_models, linq_models, llm2vec_models, mxbai_models, model2vec_models, + moka_models, 
misc_models, nomic_models, no_instruct_sentence_models, nvidia_models, openai_models, + piccolo_models, promptriever_models, repllama_models, rerankers_custom, @@ -85,6 +98,7 @@ jina_models, jasper_models, uae_models, + text2vec_models, stella_models, uae_models, voyage_models, @@ -157,9 +171,17 @@ def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En model = meta.load_model(**kwargs) # If revision not available in the modelmeta, try to extract it from sentence-transformers - if meta.revision is None and isinstance(model, SentenceTransformer): - _meta = model_meta_from_sentence_transformers(model) - meta.revision = _meta.revision if _meta.revision else meta.revision + if isinstance(model.model, SentenceTransformer): + _meta = model_meta_from_sentence_transformers(model.model) + if meta.revision is None: + meta.revision = _meta.revision if _meta.revision else meta.revision + if not meta.similarity_fn_name: + meta.similarity_fn_name = _meta.similarity_fn_name + + elif isinstance(model, CrossEncoder): + _meta = model_meta_from_cross_encoder(model.model) + if meta.revision is None: + meta.revision = _meta.revision if _meta.revision else meta.revision model.mteb_model_meta = meta # type: ignore return model @@ -194,6 +216,25 @@ def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta: return meta +empty_model_meta = ModelMeta( + name=None, + revision=None, + languages=None, + release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], +) + + @lru_cache def model_meta_from_hf_hub(model_name: str) -> ModelMeta: try: @@ -204,23 +245,57 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: frameworks.append("Sentence Transformers") return ModelMeta( name=model_name, - revision=None, + 
revision=card_data.get("base_model_revision", None), # TODO release_date=None, # TODO: We need a mapping between conflicting language codes languages=None, license=card_data.get("license", None), framework=frameworks, - public_training_data=bool(card_data.get("datasets", None)), + training_datasets=card_data.get("datasets", None), + similarity_fn_name=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + use_instructions=None, ) except Exception as e: logger.warning(f"Failed to extract metadata from model: {e}.") - return ModelMeta( - name=None, - revision=None, - languages=None, + meta = empty_model_meta + meta.name = model_name + return meta + + +def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: + try: + name = model.model.name_or_path + + meta = ModelMeta( + name=name, + revision=model.config._commit_hash, release_date=None, + languages=None, + framework=["Sentence Transformers"], + similarity_fn_name=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + use_instructions=None, + training_datasets=None, + ) + except AttributeError as e: + logger.warning( + f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." 
) + meta = empty_model_meta + return meta def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta: @@ -235,6 +310,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe if isinstance(model.model_card_data.language, str) else model.model_card_data.language ) + embeddings_dim = model.get_sentence_embedding_dimension() meta = ModelMeta( name=name, revision=model.model_card_data.base_model_revision, @@ -242,15 +318,19 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe languages=languages, framework=["Sentence Transformers"], similarity_fn_name=model.similarity_fn_name, + n_parameters=None, + max_tokens=None, + embed_dim=embeddings_dim, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + use_instructions=None, + training_datasets=None, ) except AttributeError as e: logger.warning( f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." 
) - meta = ModelMeta( - name=None, - revision=None, - languages=None, - release_date=None, - ) + meta = empty_model_meta return meta diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py new file mode 100644 index 0000000000..d51487b8ba --- /dev/null +++ b/mteb/models/piccolo_models.py @@ -0,0 +1,48 @@ +"""Piccolo Chinese embedding models by SenseNova""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +piccolo_base_zh = ModelMeta( + name="sensenova/piccolo-base-zh", + languages=["zho_Hans"], + open_weights=True, + revision="47c0a63b8f667c3482e05b2fd45577bb19252196", + release_date="2023-09-04", # first commit + n_parameters=None, # can't see on model card + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/sensenova/piccolo-base-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets=None, # They don't specify +) + +piccolo_large_zh_v2 = ModelMeta( + name="sensenova/piccolo-large-zh-v2", + languages=["zho_Hans"], + open_weights=False, # They "temporarily" removed it in may last year + # "Due to certain internal company considerations" + revision="05948c1d889355936bdf9db7d30df57dd78d25a3", + release_date="2024-04-22", # first commit + n_parameters=None, # we don't know because they removed the model + embed_dim=1024, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/sensenova/piccolo-large-zh-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets=None, # They don't say +) diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index 803a5ab89f..df2204defe 100644 --- 
a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -79,6 +79,8 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, citation=PROMPTRIEVER_CITATION, + public_training_code=None, + public_training_data=None, ) promptriever_llama3 = ModelMeta( @@ -105,6 +107,8 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, citation=PROMPTRIEVER_CITATION, + public_training_code=None, + public_training_data=None, ) @@ -132,6 +136,8 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, citation=PROMPTRIEVER_CITATION, + public_training_code=None, + public_training_data=None, ) promptriever_mistral_v1 = ModelMeta( @@ -158,4 +164,6 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, citation=PROMPTRIEVER_CITATION, + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index e132115d86..ffe1f0bd87 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -171,6 +171,8 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, citation=REPLLAMA_CITATION, + public_training_code=None, + public_training_data=None, ) @@ -197,4 +199,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, citation=REPLLAMA_CITATION, + public_training_code=None, + public_training_data=None, + training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index a4fdea8fae..34adea7ffd 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import Encoder from mteb.evaluation.evaluators.RetrievalEvaluator import DenseRetrievalExactSearch from mteb.model_meta import 
ModelMeta +from mteb.models.bge_models import bge_m3_training_data logger = logging.getLogger(__name__) @@ -204,6 +205,16 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="0a97706f3827389da43b83348d5d18c9d53876fa", release_date="2020-05-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28 @@ -219,6 +230,16 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="126747772a932960028d9f4dc93bd5d9c4869be4", release_date="2024-09-26", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) bge_reranker_v2_m3 = ModelMeta( @@ -266,6 +287,16 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="953dc6f6f85a1b2dbfca4c34a2796e7dde08d41e", release_date="2024-06-24", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=bge_m3_training_data, + framework=["Sentence Transformers", "PyTorch"], citation=""" @misc{li2023making, title={Making Large Language Models A Better Foundation For Dense Retrieval}, diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index e0443bde7a..320ee4bc7d 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -296,6 +296,16 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="77f8e3f7b1eb1afe353aa21a7c3a2fc8feca702e", 
release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], citation="""@misc{rosa2022parameterleftbehinddistillation, title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval}, author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira}, @@ -328,6 +338,16 @@ def get_prediction_tokens(self, *args, **kwargs): primaryClass={cs.IR}, url={https://arxiv.org/abs/2206.02873}, }""", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) monot5_large = ModelMeta( @@ -342,6 +362,16 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="48cfad1d8dd587670393f27ee8ec41fde63e3d98", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], citation="""@misc{rosa2022parameterleftbehinddistillation, title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval}, author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira}, @@ -365,6 +395,16 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="bc0c419a438c81f592f878ce32430a1823f5db6c", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + 
use_instructions=None, + training_datasets=None, + framework=["PyTorch"], citation="""@misc{rosa2022parameterleftbehinddistillation, title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval}, author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira}, @@ -411,6 +451,15 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_large = ModelMeta( @@ -448,6 +497,15 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_xl = ModelMeta( @@ -485,6 +543,15 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_xxl = ModelMeta( @@ -522,6 +589,15 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) @@ -537,6 +613,16 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9", release_date="2023-07-18", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + 
public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], citation="""@misc{touvron2023llama2openfoundation, title={Llama 2: Open Foundation and Fine-Tuned Chat Models}, author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom}, @@ -569,6 +655,16 @@ def get_prediction_tokens(self, *args, **kwargs): primaryClass={cs.CL}, url={https://arxiv.org/abs/2307.09288}, }""", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) mistral_7b = ModelMeta( @@ -583,6 +679,16 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, 
revision="3ad372fc79158a2148299e3318516c786aeded6c", release_date="2023-12-11", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], citation="""@misc{jiang2023mistral7b, title={Mistral 7B}, author={Albert Q. Jiang and Alexandre Sablayrolles and Arthur Mensch and Chris Bamford and Devendra Singh Chaplot and Diego de las Casas and Florian Bressand and Gianna Lengyel and Guillaume Lample and Lucile Saulnier and Lรฉlio Renard Lavaud and Marie-Anne Lachaux and Pierre Stock and Teven Le Scao and Thibaut Lavril and Thomas Wang and Timothรฉe Lacroix and William El Sayed}, @@ -607,6 +713,15 @@ def get_prediction_tokens(self, *args, **kwargs): revision="4d25d437e38b510c01852070c0731e8f6e1875d1", release_date="2024-04-29", training_datasets={"jhu-clsp/FollowIR-train": ["train"]}, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], citation=""" @misc{weller2024followir, title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, @@ -746,6 +861,15 @@ def get_prediction_tokens(self, *args, **kwargs): } """, training_datasets={"msmarco": ["train"]}, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) mt5_13b_mmarco_100k = ModelMeta( @@ -760,4 +884,14 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="e1a4317e102a525ea9e16745ad21394a4f1bffbc", release_date="2022-11-04", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, 
+ use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 16146b212a..4feac0ebc1 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,36 +6,53 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -rubert_tiny2 = ModelMeta( - name="cointegrated/rubert-tiny2", +from .bge_models import bge_m3_training_data + +rubert_tiny = ModelMeta( + name="cointegrated/rubert-tiny", languages=["rus_Cyrl"], open_weights=True, - revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", - release_date="2021-10-28", + revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", + release_date="2021-05-24", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny2", + reference="https://huggingface.co/cointegrated/rubert-tiny", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code="https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60", + training_datasets={ + # [Yandex Translate corpus](https://translate.yandex.ru/corpus), [OPUS-100](https://huggingface.co/datasets/opus100) + "Tatoeba": ["train"], + }, + adapted_from="google-bert/bert-base-multilingual-cased", + public_training_data=None, ) -rubert_tiny = ModelMeta( - name="cointegrated/rubert-tiny", +rubert_tiny2 = ModelMeta( + name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], open_weights=True, - revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", - release_date="2021-05-24", + revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", + release_date="2021-10-28", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny", + reference="https://huggingface.co/cointegrated/rubert-tiny2", similarity_fn_name="cosine", framework=["Sentence Transformers", 
"PyTorch"], use_instructions=False, + public_training_code="https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing", + training_datasets={ + # https://huggingface.co/datasets/cointegrated/ru-paraphrase-NMT-Leipzig + # Wikipedia https://huggingface.co/datasets/Madjogger/JamSpell_dataset + # https://huggingface.co/datasets/imvladikon/leipzig_corpora_collection + }, + adapted_from="cointegrated/rubert-tiny", + public_training_data=None, ) sbert_large_nlu_ru = ModelMeta( @@ -52,6 +69,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, ) sbert_large_mt_nlu_ru = ModelMeta( @@ -68,6 +88,12 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + # SNLI, MNLI + # https://github.com/brmson/dataset-sts + }, ) user_base_ru = ModelMeta( @@ -83,12 +109,13 @@ revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", n_parameters=427_000_000, - embed_dim=1024, - license="Not specified", - max_tokens=512, # best guess - reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru", + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/deepvk/USER-base", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/deepvk/deberta-v1-base", use_instructions=True, citation="""@misc{deepvk2024user, title={USER: Universal Sentence Encoder for Russian}, @@ -99,21 +126,140 @@ } """, training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Helsinki-NLP/opus-100": ["train"], - "Helsinki-NLP/bible_para": ["train"], - 
"d0rj/rudetoxifier_data_detox": ["train"], - "s-nlp/ru_paradetox": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], + "BibleNLPBitextMining": ["train"], + # https://github.com/unicamp-dl/mMARCO + # deepvk/ru-HNP + # deepvk/ru-WANLI + # MedNLI + # RCB + "TERRa": ["train"], + # Tapaco + # Opus100 + # BiblePar + # RudetoxifierDataDetox + # RuParadetox + "MIRACL": ["train"], + # MLDR + # Lenta + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + "MrTidyRetrieval": ["train"], + # "Panorama" + # PravoIsrael + # xlsum + # Fialka-v1 + # RussianKeywords + # Gazeta + # Gsm8k-ru + # DSumRu + # SummDialogNews + }, + public_training_code=None, + public_training_data=None, +) + +user_bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="deepvk/USER-bge-m3", + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + ), + name="deepvk/USER-bge-m3", + languages=["rus_Cyrl"], + open_weights=True, + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + release_date="2024-07-05", + n_parameters=359_026_688, + embed_dim=1024, + license="apache-2.0", + max_tokens=8194, + reference="https://huggingface.co/deepvk/USER-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/BAAI/bge-m3", + use_instructions=False, + training_datasets={ + "BibleNLPBitextMining": ["train"], + # https://github.com/unicamp-dl/mMARCO + # deepvk/ru-HNP + # deepvk/ru-WANLI + # MedNLI + # RCB + "TERRa": ["train"], + # Tapaco + # Opus100 + # BiblePar + # RudetoxifierDataDetox + # RuParadetox + "MIRACL": ["train"], + # MLDR + # Lenta + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": 
["train"], + "MrTidyRetrieval": ["train"], + # "Panorama" + # PravoIsrael + # xlsum + # Fialka-v1 + # RussianKeywords + # Gazeta + # Gsm8k-ru + # DSumRu + # SummDialogNews + }, + public_training_code=None, + public_training_data=None, +) + +user_bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="deepvk/USER-bge-m3", + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + ), + name="deepvk/USER-bge-m3", + languages=["rus_Cyrl"], + open_weights=True, + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + release_date="2024-07-05", + n_parameters=359_026_688, + embed_dim=1024, + license="apache-2.0", + max_tokens=8194, + reference="https://huggingface.co/deepvk/USER-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/BAAI/bge-m3", + use_instructions=False, + training_datasets={ + "BibleNLPBitextMining": ["train"], + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + **bge_m3_training_data, + # not MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Helsinki-NLP/opus-100": ["train"], + # "Helsinki-NLP/bible_para": ["train"], + # "d0rj/rudetoxifier_data_detox": ["train"], + # "s-nlp/ru_paradetox": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], }, + public_training_code=None, + public_training_data=None, ) deberta_v1_ru = ModelMeta( @@ -130,6 +276,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + # Wikipedia, Books, Twitter comments, Pikabu, Proza.ru, Film subtitles, News websites, and Social corpus + 
public_training_code=None, + public_training_data=None, + training_datasets=None, ) rubert_base_cased = ModelMeta( @@ -141,11 +291,14 @@ n_parameters=1280_000_000, embed_dim=768, license="Not specified", - max_tokens=512, # best guess + max_tokens=512, reference="https://huggingface.co/DeepPavlov/rubert-base-cased", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, citation="""@misc{kuratov2019adaptationdeepbidirectionalmultilingual, title={Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language}, author={Yuri Kuratov and Mikhail Arkhipov}, @@ -171,6 +324,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets=None, citation="""@misc{https://doi.org/10.48550/arxiv.2205.02340, doi = {10.48550/ARXIV.2205.02340}, url = {https://arxiv.org/abs/2205.02340}, @@ -197,6 +353,12 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + # "SNLI": [], + "XNLI": ["dev"] + }, ) labse_en_ru = ModelMeta( @@ -213,6 +375,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code="https://colab.research.google.com/drive/1dnPRn0-ugj3vZgSpyCC9sgslM2SuSfHy?usp=sharing", + public_training_data=None, + training_datasets=None, + adapted_from="sentence-transformers/LaBSE", ) rubert_tiny_turbo = ModelMeta( @@ -229,7 +395,11 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + public_training_code=None, + public_training_data=None, + training_datasets=None, # source model in 
unknown + # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + adapted_from="cointegrated/rubert-tiny2", ) labse_ru_turbo = ModelMeta( @@ -246,7 +416,11 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, + # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + public_training_code=None, + adapted_from="cointegrated/LaBSE-en-ru", + public_training_data=None, ) @@ -268,6 +442,30 @@ revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", release_date="2024-07-29", use_instructions=True, + n_parameters=404_000_000, + max_tokens=514, + embed_dim=1024, + license="mit", + similarity_fn_name="cosine", + adapted_from="ai-forever/ruRoberta-large", + training_datasets={ + # https://huggingface.co/ai-forever/ruRoberta-large + # https://huggingface.co/datasets/IlyaGusev/yandex_q_full + # https://huggingface.co/datasets/IlyaGusev/pikabu + # https://huggingface.co/datasets/IlyaGusev/ru_stackoverflow + # https://huggingface.co/datasets/IlyaGusev/habr + # https://huggingface.co/datasets/its5Q/habr_qna + # NewsCommentary + # MultiParaCrawl + "XNLI": [], + "XNLIV2": [], + "LanguageClassification": [], # XNLI + "MIRACLReranking": ["train"], + "MIRACLRetrieval": ["train"], + }, + public_training_data=None, + public_training_code=None, + framework=["Sentence Transformers", "PyTorch"], citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, author={Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov}, diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index ece7121bd0..c5ba799338 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -6,6 +6,8 @@ from 
mteb.model_meta import ModelMeta from mteb.models.instruct_wrapper import instruct_wrapper +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + def instruction_template( instruction: str, prompt_type: PromptType | None = None @@ -13,6 +15,19 @@ def instruction_template( return f"Instruct: {instruction}\nQuery: " if instruction else "" +SFR_TRAINING_DATA = { # inherits from e5 + **E5_MISTRAL_TRAINING_DATA, + # From previously released blogpost which now have been taken down: + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + SFR_Embedding_2_R = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -39,6 +54,10 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + adapted_from="intfloat/e5-mistral-7b-instruct", + public_training_code=None, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, citation="""@misc{SFR-embedding-2, title={SFR-Embedding-2: Advanced Text Embedding with Multi-stage Training}, author={Rui Meng*, Ye Liu*, Shafiq Rayhan Joty, Caiming Xiong, Yingbo Zhou, Semih Yavuz}, @@ -73,4 +92,7 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, ) diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 9ec25a9896..bb47467838 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -21,7 +21,6 @@ def __init__( model: str | SentenceTransformer | CrossEncoder, revision: str | None = None, model_prompts: dict[str, str] | None = None, - similarity_fn_name: 
str | None = None, **kwargs, ) -> None: """Wrapper for SentenceTransformer models. @@ -33,7 +32,6 @@ def __init__( First priority is given to the composed prompt of task name + prompt type (query or passage), then to the specific task prompt, then to the composed prompt of task type + prompt type, then to the specific task type prompt, and finally to the specific prompt type. - similarity_fn_name: A similarity function to use. **kwargs: Additional arguments to pass to the SentenceTransformer model. """ if isinstance(model, str): @@ -61,9 +59,7 @@ def __init__( if isinstance(self.model, CrossEncoder): self.predict = self.handle_instructions_predict - if similarity_fn_name: - self.similarity = self.get_similarity_function(similarity_fn_name) - elif hasattr(self.model, "similarity") and callable(self.model.similarity): + if hasattr(self.model, "similarity") and callable(self.model.similarity): self.similarity = self.model.similarity def encode( diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 3616ad8822..73dcf8a666 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -76,6 +76,41 @@ } """ + +sent_trf_training_dataset = { + # derived from datasheets + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # 
"embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], +} + all_MiniLM_L6_v2 = ModelMeta( name="sentence-transformers/all-MiniLM-L6-v2", languages=["eng-Latn"], @@ -92,43 +127,34 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # does sentence transformer count? - public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, + public_training_code=None, + public_training_data=None, citation=SBERT_CITATION, ) +all_MiniLM_L12_v2 = ModelMeta( + name="sentence-transformers/all-MiniLM-L12-v2", + languages=["eng-Latn"], + open_weights=True, + 
revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", + release_date="2021-08-30", + n_parameters=33_400_000, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=sent_trf_training_dataset, + public_training_code=None, + citation=SBERT_CITATION, + public_training_data=None, +) + paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", languages=paraphrase_langs, @@ -145,7 +171,10 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) + public_training_code=None, citation=SBERT_CITATION, + public_training_data=None, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -165,6 +194,20 @@ superseded_by=None, adapted_from=None, citation=SBERT_CITATION, + training_datasets=sent_trf_training_dataset, + # + https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/paraphrases/training.py + # which include (not in MTEB): + # "all-nli": all_nli_train_dataset, + # "sentence-compression": sentence_compression_train_dataset, + # "simple-wiki": simple_wiki_train_dataset, + # "altlex": altlex_train_dataset, + # "quora-duplicates": quora_train_dataset, + # "coco-captions": coco_train_dataset, + # "flickr30k-captions": flickr_train_dataset, + # "yahoo-answers": yahoo_answers_train_dataset, + # "stack-exchange": stack_exchange_train_dataset, + public_training_code=None, + public_training_data=None, ) labse = ModelMeta( @@ -183,6 +226,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf + 
public_training_code="https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true", citation="""@misc{feng2022languageagnosticbertsentenceembedding, title={Language-agnostic BERT Sentence Embedding}, author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang}, @@ -192,6 +237,7 @@ primaryClass={cs.CL}, url={https://arxiv.org/abs/2007.01852}, }""", + public_training_data=None, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -209,7 +255,11 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, superseded_by=None, - adapted_from=None, + adapted_from="nreimers/MiniLM-L6-H384-uncased", + training_datasets=sent_trf_training_dataset, # assumed + public_training_code=None, + public_training_data=None, + citation=SBERT_CITATION, ) all_mpnet_base_v2 = ModelMeta( @@ -228,177 +278,10 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # does sentence transformer count? - public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # 
"embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, -) - -jina_embeddings_v2_base_en = ModelMeta( - name="jinaai/jina-embeddings-v2-base-en", - languages=["eng-Latn"], - open_weights=True, - revision="6e85f575bc273f1fd840a658067d0157933c83f0", - release_date="2023-09-27", - n_parameters=137_000_000, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"allenai/c4": ["train"]}, -) - -jina_embeddings_v2_small_en = ModelMeta( - name="jinaai/jina-embeddings-v2-small-en", - languages=["eng-Latn"], - open_weights=True, - revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", - release_date="2023-09-27", - n_parameters=32_700_000, - embed_dim=512, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_b_en_v1 = ModelMeta( - name="jinaai/jina-embedding-b-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="aa0645035294a8c0607ce5bb700aba982cdff32c", - release_date="2023-07-07", - n_parameters=110_000_000, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-base-en", - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_s_en_v1 = ModelMeta( - name="jinaai/jina-embedding-s-en-v1", - 
languages=["eng-Latn"], - open_weights=True, - revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", - release_date="2023-07-07", - n_parameters=35_000_000, - embed_dim=512, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-small-en", - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -all_MiniLM_L12_v2 = ModelMeta( - name="sentence-transformers/all-MiniLM-L12-v2", - languages=["eng-Latn"], - open_weights=True, - revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", - release_date="2021-08-30", - n_parameters=33_400_000, - embed_dim=384, - license="apache-2.0", - max_tokens=256, - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - citation="""@misc{feng2022languageagnosticbertsentenceembedding, - title={Language-agnostic BERT Sentence Embedding}, - author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang}, - year={2022}, - eprint={2007.01852}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2007.01852}, - }""", - public_training_code=False, # does sentence transformer count? 
- public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, + public_training_code=None, + public_training_data=None, + citation=SBERT_CITATION, ) contriever = ModelMeta( @@ -429,6 +312,9 @@ url = {https://arxiv.org/abs/2112.09118}, doi = {10.48550/ARXIV.2112.09118}, }""", + public_training_code=None, + public_training_data=None, + training_datasets=None, ) microllama_text_embedding = ModelMeta( @@ -448,12 +334,16 @@ superseded_by=None, adapted_from=None, training_datasets={ - # shource yaml header: - "NQ": ["test"] - # not in MTEB: + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB # "sentence-transformers/all-nli": ["train"], # "sentence-transformers/stsb": ["train"], # "sentence-transformers/quora-duplicates": ["train"], # "sentence-transformers/natural-questions": 
["train"], }, + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index a738f4461e..92d5db7c8a 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -28,6 +28,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", + training_datasets=None, + # will be at https://github.com/NLPJCL/RAG-Retrieval + public_training_code=None, + public_training_data=None, ) stella_en_1_5b = ModelMeta( @@ -52,4 +56,113 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + # will be at https://github.com/NLPJCL/RAG-Retrieval + training_datasets=None, + public_training_code=None, + public_training_data=None, +) + +stella_large_zh_v3_1792d = ModelMeta( + name="dunzhang/stella-large-zh-v3-1792d", + languages=["zho_Hans"], + open_weights=True, + revision="d5d39eb8cd11c80a63df53314e59997074469f09", + release_date="2024-02-17", + n_parameters=None, # can't see on model card + embed_dim=1792, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/dunzhang/stella-large-zh-v3-1792d", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d", + adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets={ + # Not in MTEB: + # - infgrad/dialogue_rewrite_llm + # - infgrad/retrieval_data_llm + }, +) + +stella_base_zh_v3_1792d = ModelMeta( + name="infgrad/stella-base-zh-v3-1792d", + languages=["zho_Hans"], + open_weights=True, + revision="82254892a0fba125aa2abf3a4800d2dd12821343", + release_date="2024-02-17", + n_parameters=None, # can't see on model card + embed_dim=1792, + license="mit", + max_tokens=512, + 
reference="https://huggingface.co/infgrad/stella-base-zh-v3-1792d", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets={ + # Not in MTEB: + # - infgrad/dialogue_rewrite_llm + # - infgrad/retrieval_data_llm + }, +) + + +stella_mrl_large_zh_v3_5_1792d = ModelMeta( + name="dunzhang/stella-mrl-large-zh-v3.5-1792d", + languages=["zho_Hans"], + open_weights=True, + revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe", + release_date="2024-02-27", + n_parameters=326 * 1e6, + embed_dim=1792, + license="mit", + max_tokens=512, + reference="https://huggingface.co/dunzhang/stella-large-zh-v3-1792d", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="dunzhang/stella-large-zh-v3-1792d", + public_training_code=None, + public_training_data=None, + training_datasets=None, # Not specified +) + +zpoint_large_embedding_zh = ModelMeta( + name="iampanda/zpoint_large_embedding_zh", + languages=["zho_Hans"], + open_weights=True, + revision="b1075144f440ab4409c05622c1179130ebd57d03", + release_date="2024-06-04", + n_parameters=326 * 1e6, + embed_dim=1792, + license="mit", + max_tokens=512, + reference="https://huggingface.co/iampanda/zpoint_large_embedding_zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", + public_training_code=None, + public_training_data=None, + training_datasets={ + # It's a bit unclear what they have trained on to be honest, because they don't list all + # And they also have some rather cryptic description of their training procedure, but at + # Least they disclose that they have trained on these: + "MIRACLRetrieval": ["train"], + "MIRACLReranking": ["train"], + 
"DuRetrieval": ["train"], + "T2Retrieval": ["train"], + "MultiLongDocRetrieval": ["train"], + # Not in MTEB: + # - Shitao/bge-reranker-data + # - FreedomIntelligence/Huatuo26M-Lite + }, ) diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py new file mode 100644 index 0000000000..86a9bcca4f --- /dev/null +++ b/mteb/models/text2vec_models.py @@ -0,0 +1,100 @@ +"""Implementation of Text2Vec models""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +# I couldn't find the large model on HF for some reason +text2vec_base_chinese = ModelMeta( + name="shibing624/text2vec-base-chinese", + languages=["zho-Hans"], + open_weights=True, + revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e", + release_date="2022-01-23", + n_parameters=102 * 1e6, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/shibing624/text2vec-base-chinese", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, # Couldn't find it + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese + # Not in MTEB + # - shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset + # (Could have overlaps I'm not aware of) + }, +) + +text2vec_base_chinese_paraphrase = ModelMeta( + name="shibing624/text2vec-base-chinese-paraphrase", + languages=["zho-Hans"], + open_weights=True, + revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd", + release_date="2023-06-19", + n_parameters=118 * 1e6, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, # Couldn't 
find it + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese + # Not in MTEB + # - shibing624/nli-zh-all/text2vec-base-chinese-paraphrase + # (Could have overlaps I'm not aware of) + }, +) + + +text2vec_multi_langs = [ + "deu-Latn", # German (de) + "eng-Latn", # English (en) + "spa-Latn", # Spanish (es) + "fra-Latn", # French (fr) + "ita-Latn", # Italian (it) + "nld-Latn", # Dutch (nl) + "pol-Latn", # Polish (pl) + "por-Latn", # Portuguese (pt) + "rus-Cyrl", # Russian (ru) + "zho-Hans", # Chinese (Simplified, zh) +] +text2vec_base_multilingual = ModelMeta( + name="shibing624/text2vec-base-multilingual", + languages=text2vec_multi_langs, + open_weights=True, + revision="6633dc49e554de7105458f8f2e96445c6598e9d1", + release_date="2023-06-22", + # While it can be loaded with SBERT, it has one suspicious file according to huggingface + # So probably best not to. + loader=None, + n_parameters=118 * 1e6, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + public_training_code=None, + public_training_data=None, # Couldn't find it + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese + # Not in MTEB + # - shibing624/nli-zh-all/tree/main/text2vec-base-multilingual-dataset + # # (Could have overlaps I'm not aware of) + }, +) diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index cb83d57c77..a12a936326 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -83,4 +83,13 @@ def encode( year={2023} } """, + training_datasets={ + # source: https://arxiv.org/pdf/2309.12871 + # not in MTEB + "MNLI": [], + "NLI": [], + "SNLI": [], + }, + public_training_code=None, + 
public_training_data=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 02078e4cde..a637dee36a 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -12,6 +12,11 @@ from .wrapper import Wrapper +VOYAGE_TRAINING_DATA = { + # Self-reported (message from VoyageAI member) + # synthetic data +} + def token_limit(max_tpm: int, interval: int = 60): limit_interval_start_ts = time.time() @@ -156,6 +161,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_finance_2 = ModelMeta( @@ -177,6 +185,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_law_2 = ModelMeta( @@ -198,6 +209,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_code_2 = ModelMeta( @@ -219,6 +233,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_large_2 = ModelMeta( @@ -240,6 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_2 = ModelMeta( @@ -261,6 +281,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -281,6 +304,9 @@ def _batched_encode( 
similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_3 = ModelMeta( @@ -302,6 +328,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, ) voyage_3_lite = ModelMeta( @@ -323,4 +352,79 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + + +voyage_3_exp = ModelMeta( + name="voyageai/voyage-3-m-exp", + revision="1", + release_date=None, # not released + languages=None, # supported languages not specified + loader=partial( + VoyageWrapper, + model_name="voyage-3-m-exp", + model_prompts=model_prompts, + ), + max_tokens=32000, + embed_dim=512, + open_weights=False, + n_parameters=None, + license=None, + reference="https://huggingface.co/voyageai/voyage-3-m-exp", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets={ + # MTEB(eng, classic) training data: + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + 
"AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on + }, + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/wrapper.py b/mteb/models/wrapper.py index 76b31ba529..956071d3dc 100644 --- a/mteb/models/wrapper.py +++ b/mteb/models/wrapper.py @@ -3,12 +3,9 @@ import logging from typing import Callable, get_args -import numpy as np - import mteb from mteb.abstasks.TaskMetadata import TASK_TYPE from mteb.encoder_interface import PromptType -from mteb.evaluation.evaluators.utils import cos_sim, dot_score logger = logging.getLogger(__name__) @@ -67,18 +64,6 @@ def get_prompt_name( ) return None - @staticmethod - def get_similarity_function( - similarity_fn_name: str, - ) -> Callable[[np.ndarray, np.ndarray], np.ndarray]: - if similarity_fn_name == "cosine": - return cos_sim - if similarity_fn_name == "dot": - return dot_score - raise ValueError( - "Invalid similarity function. 
Should be one of ['cosine', 'dot']" - ) - @staticmethod def validate_task_to_prompt_name( task_to_prompt_name: dict[str, str] | None, diff --git a/mteb/overview.py b/mteb/overview.py index 5846993b02..31bc5130e8 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -124,13 +124,11 @@ def __repr__(self) -> str: return "MTEBTasks" + super().__repr__() @staticmethod - def _extract_property_from_task(task, property): + def _extract_property_from_task(task: AbsTask, property: str): if hasattr(task.metadata, property): return getattr(task.metadata, property) elif hasattr(task, property): return getattr(task, property) - elif property in task.metadata_dict: - return task.metadata_dict[property] else: raise KeyError("Property neither in Task attribute or in task metadata.") diff --git a/pyproject.toml b/pyproject.toml index 86c250c7f4..f42014e3a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.0" +version = "1.29.16" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -64,7 +64,7 @@ docs = [ speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8"] +leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] diff --git a/scripts/compare_leaderboard_results.py b/scripts/compare_leaderboard_results.py index bbeb912bb4..1fe9c3d766 100644 --- a/scripts/compare_leaderboard_results.py +++ b/scripts/compare_leaderboard_results.py @@ -2,70 +2,84 @@ import json import logging +from collections import defaultdict from pathlib import Path -from mteb import MTEB_ENG_CLASSIC, load_results +from mteb import get_benchmark, load_results logging.basicConfig(level=logging.INFO) models = [ - "dunzhang/stella_en_1.5B_v5", - "dunzhang/stella_en_400M_v5", + 
"intfloat/multilingual-e5-small", # Add other models here ] +benchmark = get_benchmark("MTEB(Chinese)") + +results = [] # in same folder as mteb repo # git clone https://github.com/embeddings-benchmark/leaderboard -data_tasks_path = Path("../../leaderboard/boards_data/en/data_tasks/") +# get path of current file +base_path = Path(__file__).parent.parent.parent / "leaderboard" / "boards_data" -results = [] for model_name_to_search in models: model_results = load_results( models=[model_name_to_search], - tasks=MTEB_ENG_CLASSIC.tasks, + tasks=benchmark.tasks, only_main_score=True, + require_model_meta=False, ) - cur_model = {} + cur_model = {task.metadata.name: defaultdict(dict) for task in benchmark.tasks} for model_res in model_results: for task_res in model_res.task_results: task_name = task_res.task.metadata.name - split = "test" if task_name != "MSMARCO" else "dev" - scores = [score["main_score"] for score in task_res.scores[split]] - # this tmp solution, because some tasks have multiple results - cur_model[task_name] = {"new": round((sum(scores) / len(scores)) * 100, 2)} - for task_dir in data_tasks_path.iterdir(): - if task_dir.is_dir(): - results_file_path = task_dir / "default.jsonl" - if results_file_path.exists(): - with open(results_file_path) as file: - for line in file: - data = json.loads(line) - model_name = data.get("Model", "") - if model_name_to_search in model_name: - for key, value in data.items(): - if key in [ - "index", - "Rank", - "Model", - "Model Size (Million Parameters)", - "Memory Usage (GB, fp32)", - "Embedding Dimensions", - "Max Tokens", - "Average", - ]: - continue - for benchmark_task in MTEB_ENG_CLASSIC.tasks: - if benchmark_task.metadata.name in key: - cur_model[benchmark_task.metadata.name][ - "old" - ] = value + split = ( + "test" + if "test" in task_res.task.metadata.eval_splits + else task_res.task.metadata.eval_splits[0] + ) + if split in task_res.scores: + scores = [score["main_score"] for score in task_res.scores[split]] + 
cur_model[task_name]["new"] = round( + (sum(scores) / len(scores)) * 100, 2 + ) + + for lang_path in base_path.iterdir(): + data_tasks_path = lang_path / "data_tasks" + + for task_dir in data_tasks_path.iterdir(): + if task_dir.is_dir(): + results_file_path = task_dir / "default.jsonl" + if results_file_path.exists(): + with open(results_file_path) as file: + for line in file: + data = json.loads(line) + model_name = data.get("Model", "") + if model_name_to_search in model_name: + for key, value in data.items(): + if key in [ + "index", + "Rank", + "Model", + "Model Size (Million Parameters)", + "Memory Usage (GB, fp32)", + "Embedding Dimensions", + "Max Tokens", + "Average", + ]: + continue + for benchmark_task in benchmark.tasks: + if benchmark_task.metadata.name in key: + cur_model[benchmark_task.metadata.name][ + "old" + ] = value sorted_cur_model = { task.metadata.name: cur_model[task.metadata.name] - for task in MTEB_ENG_CLASSIC.tasks + for task in benchmark.tasks if task.metadata.name in cur_model } results.append({"model": model_name_to_search, "results": sorted_cur_model}) diff --git a/scripts/data/create_task_table.py b/scripts/data/create_task_table.py index e5b292a08a..e15edb4820 100644 --- a/scripts/data/create_task_table.py +++ b/scripts/data/create_task_table.py @@ -137,14 +137,9 @@ def get_ds_stats(hf_hub_name): # Select all tasks for task in MTEB().tasks: print("Task: ", task) - if "dataset" in task.metadata_dict: - hub_name = hub_url = task.metadata.dataset["path"] - ds_stats = get_ds_stats(hub_name.split("/")[-1]) - elif "beir_name" in task.metadata_dict: - hub_name = hub_url = "BeIR/" + task.metadata_dict.get("beir_name") - ds_stats = get_ds_stats_beir("/".join(hub_name.split("/")[1:])) - if "cqadupstack" in hub_name: - hub_url = "BeIR/cqadupstack-qrels" + hub_name = hub_url = task.metadata.dataset["path"] + ds_stats = get_ds_stats(hub_name.split("/")[-1]) + TABLE_STRING += "\n" + ONE_LINE.format( 
f"[{task.metadata.name}]({task.metadata.reference})", f"[{hub_name}](https://huggingface.co/datasets/{hub_url})", diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py index ba1bc1a8b0..6cbaa2c298 100644 --- a/scripts/extract_model_names.py +++ b/scripts/extract_model_names.py @@ -1,11 +1,14 @@ from __future__ import annotations +import argparse import ast -import sys +import logging from pathlib import Path from git import Repo +logging.basicConfig(level=logging.INFO) + def get_changed_files(base_branch="main"): repo_path = Path(__file__).parent.parent @@ -28,8 +31,11 @@ def get_changed_files(base_branch="main"): ] -def extract_model_names(files: list[str]) -> list[str]: +def extract_model_names( + files: list[str], return_one_model_name_per_file=False +) -> list[str]: model_names = [] + first_model_found = False for file in files: with open(file) as f: tree = ast.parse(f.read()) @@ -52,17 +58,44 @@ def extract_model_names(files: list[str]) -> list[str]: ) if model_name: model_names.append(model_name) + first_model_found = True + if return_one_model_name_per_file and first_model_found: + logging.info(f"Found model name {model_name} in file {file}") + break # NOTE: Only take the first model_name per file to avoid disk out of space issue. return model_names +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "base_branch", + nargs="?", + default="main", + help="Base branch to compare changes with", + ) + parser.add_argument( + "--return_one_model_name_per_file", + action="store_true", + default=False, + help="Only return one model name per file.", + ) + return parser.parse_args() + + if __name__ == "__main__": """ Can pass in base branch as an argument. Defaults to 'main'. e.g. 
python extract_model_names.py mieb """ - base_branch = sys.argv[1] if len(sys.argv) > 1 else "main" + + args = parse_args() + + base_branch = args.base_branch changed_files = get_changed_files(base_branch) - model_names = extract_model_names(changed_files) + model_names = extract_model_names( + changed_files, + return_one_model_name_per_file=args.return_one_model_name_per_file, + ) output_file = Path(__file__).parent / "model_names.txt" with output_file.open("w") as f: f.write(" ".join(model_names)) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index a96604446e..4ae87fdbca 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -220,7 +220,6 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=card_data.get("license", None), framework=frameworks, n_parameters=n_parameters, - public_training_data=bool(datasets), adapted_from=get_base_model(model_name), training_datasets=training_datasets, open_weights=True, @@ -237,6 +236,17 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: revision=None, languages=None, release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + frameworks=[], ) diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index cedf393210..99c24b756f 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -958,12 +958,10 @@ def load_data(self, **kwargs): ), } - self.dataset = DatasetDict( - { - "eng": data, - "fra": data, - } - ) + self.dataset = {} + for lang in self.hf_subsets: + self.dataset[lang] = data + self.data_loaded = True min_score = 0 diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 8ae310555f..3ad484b6ff 100644 --- a/tests/test_benchmark/task_grid.py +++ 
b/tests/test_benchmark/task_grid.py @@ -2,14 +2,8 @@ from __future__ import annotations +import mteb from mteb.abstasks import AbsTask -from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining -from mteb.tasks.Classification.multilingual.IndicSentimentClassification import ( - IndicSentimentClassification, -) -from mteb.tasks.Clustering.eng.TwentyNewsgroupsClustering import ( - TwentyNewsgroupsClusteringFast, -) from .mock_tasks import ( MockBitextMiningTask, @@ -39,31 +33,25 @@ MockSummarizationTask, ) -twenty_news = TwentyNewsgroupsClusteringFast() - -# downsample to speed up tests -twenty_news.max_document_to_embed = 1000 -twenty_news.n_clusters = 2 -twenty_news.max_fraction_of_documents_to_embed = None - -TASK_TEST_GRID = [ - BornholmBitextMining(), # bitext mining + just supplying a task class instead of a string - IndicSentimentClassification( # multi subset loader - hf_subsets=["as"], # we only load one subset here to speed up tests - n_experiments=2, # to speed up the test - ), - "TwentyNewsgroupsClustering", # clustering and string instead of class - twenty_news, # fast clustering - "Banking77Classification", # classification - "SciDocsRR", # reranking - "FarsTail", # pair classification - "TwitterHjerneRetrieval", # retrieval - "BrazilianToxicTweetsClassification", # multilabel classification - "FaroeseSTS", # STS - "SummEval", # summarization - "Core17InstructionRetrieval", # instruction reranking - "InstructIR", # instruction retrieval -] +TASK_TEST_GRID = ( + mteb.get_tasks( + tasks=[ + "BornholmBitextMining", # bitext mining + just supplying a task class instead of a string + "TwentyNewsgroupsClustering", # clustering and string instead of class + "TwentyNewsgroupsClustering.v2", # fast clustering + "Banking77Classification", # classification + "SciDocsRR", # reranking + "FarsTail", # pair classification + "TwitterHjerneRetrieval", # retrieval + "BrazilianToxicTweetsClassification", # multilabel classification + 
"FaroeseSTS", # STS + "SummEval", # summarization + "Core17InstructionRetrieval", # instruction reranking + "InstructIR", # instruction retrieval + ] + ) + + mteb.get_tasks(tasks=["IndicSentimentClassification"], languages=["asm-Beng"]) +) TASK_TEST_GRID_AS_STRING = [ t.metadata.name if isinstance(t, AbsTask) else t for t in TASK_TEST_GRID diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 0c8521578d..37a226f737 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -41,7 +41,7 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path: Path): """Test that multiple tasks can be run""" eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder=str(tmp_path), overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) # ensure that we can generate a readme from the output folder generate_readme(tmp_path) @@ -56,7 +56,9 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path MockTorchbf16Encoder(), ], ) -def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_encoders_on_task( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run using a variety of encoders""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -64,15 +66,17 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix()) -@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask]) +@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @pytest.mark.parametrize( "model", [MockSentenceTransformer()], ) -def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder): +def 
test_run_eval_without_co2_tracking( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run without CO2 tracking""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -80,9 +84,7 @@ def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder) tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run( - model, output_folder="tests/results", overwrite_results=True, co2_tracker=False - ) + eval.run(model, output_folder=tmp_path.as_posix(), co2_tracker=False) @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID[:1]) @@ -95,20 +97,22 @@ def test_reload_results(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path tasks = [task] eval = mteb.MTEB(tasks=tasks) - results = eval.run(model, output_folder=str(tmp_path), overwrite_results=True) + results = eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) assert isinstance(results, list) assert isinstance(results[0], mteb.TaskResult) # reload the results - results = eval.run(model, output_folder=str(tmp_path), overwrite_results=False) + results = eval.run( + model, output_folder=tmp_path.as_posix(), overwrite_results=False + ) assert isinstance(results, list) assert isinstance(results[0], mteb.TaskResult) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask): +def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the prompt_name to the encoder which supports it, and that the encoder which does not support it does not receive it. 
""" @@ -141,17 +145,17 @@ def encode(self, sentences, **kwargs): eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) # Test that the task_name is not passed down to the encoder model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency") assert model.prompts == {}, "The encoder should not have any prompts" - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask): +def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the encode_kwargs to the encoder.""" my_encode_kwargs = {"no_one_uses_this_args": "but_its_here"} @@ -175,27 +179,27 @@ def encode(self, sentences, task_name: str | None = None, **kwargs): model = MockEncoderWithKwargs() eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, encode_kwargs=my_encode_kwargs, ) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_benchmark(model: mteb.Encoder): +def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a benchmark object can be run using the MTEB class.""" bench = Benchmark( name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) ) - eval = mteb.MTEB(tasks=bench) + eval = mteb.MTEB(tasks=[bench]) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix(), overwrite_results=True ) # we just want to test that it runs @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_list_of_benchmark(model: mteb.Encoder): +def test_run_using_list_of_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a list of benchmark objects can be run using the 
MTEB class.""" bench = [ Benchmark(name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])) @@ -203,7 +207,7 @@ def test_run_using_list_of_benchmark(model: mteb.Encoder): eval = mteb.MTEB(tasks=bench) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix() ) # we just want to test that it runs @@ -229,7 +233,7 @@ def test_get_benchmark(name): @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID) @pytest.mark.parametrize("is_task_name", [True, False]) def test_prompt_name_passed_to_all_encodes_with_prompts( - task: AbsTask | str, is_task_name: bool + task: AbsTask | str, is_task_name: bool, tmp_path: Path ): """Test that all tasks and task_types correctly pass down the prompt_name to the encoder with prompts.""" _task_name = task.metadata.name if isinstance(task, AbsTask) else task @@ -258,8 +262,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ) eval.run( model, - output_folder="tests/results", - overwrite_results=True, + output_folder=tmp_path.as_posix(), ) class MockEncoderWithExistingPrompts(mteb.Encoder): @@ -275,7 +278,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): model = MockSentenceTransformerWrapper(MockEncoderWithExistingPrompts()) eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) @@ -292,7 +295,9 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ], ) @pytest.mark.parametrize("is_task_name", [True, False]) -def test_model_query_passage_prompts_task_type(task: AbsTask | str, is_task_name: bool): +def test_model_query_passage_prompts_task_type( + task: AbsTask | str, is_task_name: bool, tmp_path: Path +): """Test that the model with prompts is correctly called.""" tasks = [task] @@ -331,8 +336,7 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - 
output_folder="tests/results", - overwrite_results=True, + output_folder=tmp_path.as_posix(), ) model = MockSentenceTransformerWrapper( MockSentenceEncoderWithPrompts(), model_prompts=prompt_list @@ -341,6 +345,5 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", - overwrite_results=True, + output_folder=tmp_path.as_posix(), ) diff --git a/tests/test_benchmark/test_benchmark_integration_with_datasets.py b/tests/test_benchmark/test_benchmark_integration_with_datasets.py index 81d4c6b676..8288680c3c 100644 --- a/tests/test_benchmark/test_benchmark_integration_with_datasets.py +++ b/tests/test_benchmark/test_benchmark_integration_with_datasets.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,7 @@ @pytest.mark.parametrize("task", TASK_TEST_GRID) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path): """Test that a task can be fetched and run""" eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py index 4ca0056cd7..e79515be56 100644 --- a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py +++ b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest from sentence_transformers import SentenceTransformer @@ -22,9 +23,11 @@ "average_word_embeddings_levy_dependency", ], ) -def 
test_benchmark_sentence_transformer(task: str | AbsTask, model_name: str): +def test_benchmark_sentence_transformer( + task: str | AbsTask, model_name: str, tmp_path: Path +): """Test that a task can be fetched and run""" if isinstance(model_name, str): model = SentenceTransformer(model_name) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py new file mode 100644 index 0000000000..5d6cc1a022 --- /dev/null +++ b/tests/test_benchmark/test_models.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +import mteb +from mteb import MTEB +from mteb.abstasks import AbsTask + +from .mock_tasks import MockRetrievalTask + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher") +@pytest.mark.parametrize("model", ["colbert-ir/colbertv2.0"]) +@pytest.mark.parametrize("task", [MockRetrievalTask()]) +def test_colbert_model_e2e(task: AbsTask, model: str, tmp_path: Path): + pytest.importorskip("pylate", reason="pylate not installed") + eval_splits = ["test"] + model = mteb.get_model(model) + evaluation = MTEB(tasks=[task]) + + results = evaluation.run( + model, + eval_splits=eval_splits, + corpus_chunk_size=500, + output_folder=tmp_path.as_posix(), + ) + result = results[0] + + assert result.scores["test"][0]["ndcg_at_1"] == 1.0 + + +def test_bm25s_e2e(tmp_path: Path): + # fails for dataset smaller then 1000 + pytest.importorskip("bm25s", reason="bm25s not installed") + pytest.importorskip("Stemmer", reason="PyStemmer not installed") + + model = mteb.get_model("bm25s") + tasks = mteb.get_tasks(tasks=["NFCorpus"]) + eval_splits = ["test"] + + evaluation = MTEB(tasks=tasks) + + results = evaluation.run( + model, eval_splits=eval_splits, output_folder=tmp_path.as_posix() + ) + 
result = results[0] + + assert result.scores["test"][0]["ndcg_at_1"] == 0.42879 diff --git a/tests/test_cli.py b/tests/test_cli.py index 7c71528f0d..fc4a468112 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,12 +50,13 @@ def test_run_task( model_name: str, task_name: str, model_revision: str, + tmp_path: Path, ): args = Namespace( model=model_name, tasks=[task_name], model_revision=model_revision, - output_folder="tests/results/test_model", + output_folder=tmp_path.as_posix(), verbosity=3, device=None, categories=None, @@ -71,9 +72,7 @@ def test_run_task( run(args) model_name_as_path = model_name.replace("/", "__").replace(" ", "_") - results_path = Path( - f"tests/results/test_model/{model_name_as_path}/{model_revision}" - ) + results_path = tmp_path / model_name_as_path / model_revision assert results_path.exists(), "Output folder not created" assert "model_meta.json" in [ f.name for f in list(results_path.glob("*.json")) @@ -122,7 +121,7 @@ def test_create_meta(): ), f"Value for {key} does not match" # ensure that the command line interface works as well - command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --overwrite" + command = f"{sys.executable} -m mteb create_meta --results_folder {results.as_posix()} --output_path {output_path.as_posix()} --overwrite" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" @@ -134,14 +133,16 @@ def test_create_meta(): ("model_card_without_frontmatter.md", "model_card_gold_without_frontmatter.md"), ], ) -def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: str): +def test_create_meta_from_existing( + existing_readme_name: str, gold_readme_name: str, tmp_path: Path +): """Test create_meta function directly as well as through the command line interface""" test_folder = Path(__file__).parent output_folder = test_folder / "create_meta" results = ( output_folder / 
"all-MiniLM-L6-v2" / "8b3219a92973c328a8e22fadcfa821b5dc75636a" ) - output_path = output_folder / "model_card.md" + output_path = tmp_path / "model_card.md" existing_readme = output_folder / existing_readme_name args = Namespace( @@ -183,7 +184,7 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: ), f"Value for {key} does not match" assert readme_output == gold_readme # ensure that the command line interface works as well - command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite" + command = f"{sys.executable} -m mteb create_meta --results_folder {results.as_posix()} --output_path {output_path.as_posix()} --from_existing {existing_readme.as_posix()} --overwrite" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" diff --git a/tests/test_evaluation/test_split_evaluation.py b/tests/test_evaluation/test_split_evaluation.py index 7ce3f512e0..7db10e09d4 100644 --- a/tests/test_evaluation/test_split_evaluation.py +++ b/tests/test_evaluation/test_split_evaluation.py @@ -8,6 +8,7 @@ ) from tests.test_benchmark.mock_tasks import ( MockMultilingualRetrievalTask, + MockMultilingualSTSTask, MockRetrievalTask, ) @@ -362,3 +363,18 @@ def test_all_splits_subsets_evaluated_with_overwrite( for split in ["test"]: assert len(results2[0].scores[split]) == 2 assert sorted(results2[0].languages) == ["eng", "fra"] + + +def test_splits_evaluated_with_prefiltering(): + """Test that the evaluation only runs on the specified languages. 
Issue https://github.com/embeddings-benchmark/mteb/pull/1787#issuecomment-2598205049""" + task = MockMultilingualSTSTask().filter_languages(languages=["fra"]) + + evaluation = MTEB(tasks=[task]) + + results = evaluation.run(MockSentenceTransformer(), overwrite_results=True) + result_scores = results[0].scores + + assert len(result_scores) == 1 + assert "test" in result_scores + assert len(result_scores["test"]) == 1 + assert result_scores["test"][0]["hf_subset"] == "fra" diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py new file mode 100644 index 0000000000..2d23bc66cb --- /dev/null +++ b/tests/test_model_meta/test_model_meta.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest +from sentence_transformers import CrossEncoder, SentenceTransformer + +from mteb import MTEB +from mteb.abstasks import AbsTask +from tests.test_benchmark.mock_tasks import MockRetrievalTask + + +def test_create_model_meta_from_sentence_transformers(): + model_name = "sentence-transformers/average_word_embeddings_levy_dependency" + revision = "6d9c09a789ad5dd126b476323fccfeeafcd90509" + model = SentenceTransformer(model_name, revision=revision) + + meta = MTEB.create_model_meta(model) + + assert meta.similarity_fn_name == "cosine" + assert meta.embed_dim == model.get_sentence_embedding_dimension() + assert type(meta.framework) is list + assert meta.framework[0] == "Sentence Transformers" + assert meta.name == model_name + assert meta.revision == revision + + +def test_create_model_meta_from_cross_encoder(): + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + revision = "841d331b6f34b15d6ac0ab366ae3a3b36eeac691" + model = CrossEncoder(model_name, revision=revision) + + meta = MTEB.create_model_meta(model) + + assert meta.name == model_name + assert meta.revision == revision + + return meta + + +@pytest.mark.parametrize("task", [MockRetrievalTask()]) +def 
test_output_folder_model_meta(task: AbsTask, tmp_path: Path): + mteb = MTEB(tasks=[task]) + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + model = CrossEncoder(model_name) + meta = mteb.create_model_meta(model) + output_path = mteb.create_output_folder( + model_meta=meta, output_folder=tmp_path.as_posix() + ) + + output_path = Path(output_path) + assert output_path.exists() + assert output_path.is_dir() + assert output_path.name == model.config._commit_hash + assert output_path.parent.name == "cross-encoder__ms-marco-TinyBERT-L-2-v2" + assert output_path.parent.parent == tmp_path + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher") +def test_model_meta_colbert(): + model_name = "colbert-ir/colbertv2.0" + colbert_model = pytest.importorskip("pylate.models", reason="pylate not installed") + revision = "c1e84128e85ef755c096a95bdb06b47793b13acf" + model = colbert_model.ColBERT(model_name, revision=revision) + + meta = MTEB.create_model_meta(model) + + # assert meta.similarity_fn_name == "MaxSim" test with new release of pylate + assert type(meta.framework) is list + assert meta.framework[0] == "Sentence Transformers" + assert meta.name == model_name + assert meta.revision == revision diff --git a/tests/test_overview.py b/tests/test_overview.py index 127e54f279..6136af1ea5 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -98,8 +98,3 @@ def test_MTEBTasks( # check for header of a table n_langs = len(tasks) assert len(tasks.to_markdown().split("\n")) - 3 == n_langs - - -def test_all_tasks_fetch(): - """Test that all tasks can be fetched""" - mteb.MTEB.mteb_tasks() diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index 566864a112..1973072bab 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,9 @@ 
@pytest.mark.parametrize("task_name", ["BornholmBitextMining"]) @pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"]) @pytest.mark.parametrize("model_revision", ["8b3219a92973c328a8e22fadcfa821b5dc75636a"]) -def test_reproducibility_workflow(task_name: str, model_name: str, model_revision: str): +def test_reproducibility_workflow( + task_name: str, model_name: str, model_revision: str, tmp_path: Path +): """Test that a model and a task can be fetched and run in a reproducible fashion.""" model_meta = mteb.get_model_meta(model_name, revision=model_revision) task = mteb.get_task(task_name) @@ -30,13 +33,13 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio assert isinstance(model, Encoder) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize( "task_name", TASK_TEST_GRID - + [ + + ( "BitextMining", "Classification", "MultilabelClassification", @@ -49,7 +52,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio "InstructionRetrieval", "InstructionReranking", "Speed", - ], + ), ) def test_validate_task_to_prompt_name(task_name: str | AbsTask): if isinstance(task_name, AbsTask): diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index af66133273..91a7b95070 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -8,20 +8,17 @@ import pytest import mteb -from mteb import MTEB from mteb.abstasks import AbsTask, MultilingualTask from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask -from mteb.overview import TASKS_REGISTRY +from mteb.overview import TASKS_REGISTRY, get_tasks from ..test_benchmark.task_grid import 
MOCK_TASK_TEST_GRID_AS_STRING logging.basicConfig(level=logging.INFO) -tasks = [ - t for t in MTEB().tasks_cls if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING -] +tasks = [t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] @pytest.mark.parametrize("task", tasks) @@ -84,7 +81,7 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" - tasks = MTEB().tasks_cls + tasks = get_tasks() tasks = [ t for t in tasks diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index c540bb41ee..7705de4d3f 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -6,6 +6,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from mteb import MTEB from mteb.model_meta import ModelMeta @@ -318,11 +319,7 @@ def test_mteb_rerank(tmp_path: Path): "1395", ] model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2") - eval = MTEB( - tasks=[ - "SciFact", - ] - ) + eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) # create fake first stage results tmp_file = tmp_path / "tmp.json" with open(tmp_file, "w") as f: @@ -342,17 +339,16 @@ def test_mteb_rerank(tmp_path: Path): eval.run( model, # type: ignore - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, eval_splits=["test"], top_k=2, previous_results=tmp_file, save_predictions=True, ) - tmp_file.unlink() # read in the results - with open("tests/results/SciFact_default_predictions.json") as f: + with (tmp_path / "SciFact_default_predictions.json").open() as f: results = json.load(f) # check that only the top two results are re-orderd @@ -361,7 +357,7 @@ def test_mteb_rerank(tmp_path: Path): assert "18670" in results["1"] -def test_reranker_same_ndcg1(): +def test_reranker_same_ndcg1(tmp_path: Path): de_name = 
"average_word_embeddings_komninos" revision = "21eec43590414cb8e3a6f654857abed0483ae36e" de = SentenceTransformer(de_name, revision=revision) @@ -373,34 +369,48 @@ def test_reranker_same_ndcg1(): open_weights=True, revision=ce_revision, release_date="2021-04-15", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + public_training_data=None, + reference=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) - eval = MTEB(tasks=["SciFact"]) + eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) + stage1_path = tmp_path / "stage1" eval.run( de, - output_folder="tests/results/stage1", + output_folder=stage1_path.as_posix(), overwrite_results=True, save_predictions=True, eval_splits=["test"], ) + stage2_path = tmp_path / "stage2" eval.run( ce, # type: ignore - output_folder="tests/results/stage2", + output_folder=stage2_path.as_posix(), overwrite_results=True, - previous_results="tests/results/stage1/SciFact_default_predictions.json", + previous_results=(stage1_path / "SciFact_default_predictions.json"), save_predictions=False, eval_splits=["test"], top_k=1, # don't allow it to rerank more than 1 so we can check for top_1 being the same ) # read in stage 1 and stage two and check ndcg@1 is the same - with open( - f"tests/results/stage1/sentence-transformers__{de_name}/{revision}/SciFact.json" - ) as f: + with ( + stage1_path / f"sentence-transformers__{de_name}/{revision}/SciFact.json" + ).open() as f: stage1 = json.load(f) - with open( - f"tests/results/stage2/cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" - ) as f: + with ( + stage2_path + / f"cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" + ).open() as f: stage2 = json.load(f) assert (