diff --git a/LICENSE b/LICENSE
index 261eeb9e9..1490794b8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -178,7 +178,7 @@
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
+ boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
- Copyright [yyyy] [name of copyright owner]
+ Copyright 2023 Alibaba
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -199,3 +199,202 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+
+-------------------------------------------------------------------------------
+
+Code in data_juicer/ops/common/helper_func.py, data_juicer/ops/deduplicator/document_deduplicator.py,
+data_juicer/ops/deduplicator/document_simhash_deduplicator.py, data_juicer/ops/filter/character_repetition_filter.py,
+data_juicer/ops/filter/flagged_words_filter.py, data_juicer/ops/filter/perplexity_filter.py,
+data_juicer/ops/filter/special_characters_filter.py, data_juicer/ops/filter/stopwords_filter.py,
+data_juicer/ops/filter/word_repetition_filter.py, data_juicer/ops/mapper/punctuation_normalization_mapper.py,
+data_juicer/ops/mapper/remove_long_words_mapper.py, app.py is adapted from
+https://huggingface.co/spaces/huggingface/text-data-filtering or
+https://github.com/bigscience-workshop/data-preparation
+
+ Copyright 2021 BigScience
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-------------------------------------------------------------------------------
+
+Code in data_juicer/ops/deduplicator/document_minhash_deduplicator.py is
+adapted from
+https://github.com/bigcode-project/bigcode-dataset
+
+ Copyright 2022 bigcode authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-------------------------------------------------------------------------------
+
+Code in data_juicer/ops/mapper/clean_copyright_mapper.py, data_juicer/ops/mapper/clean_html_mapper.py,
+data_juicer/ops/mapper/expand_macro_mapper.py, data_juicer/ops/mapper/remove_bibliography_mapper.py,
+data_juicer/ops/mapper/remove_comments_mapper.py, data_juicer/ops/mapper/remove_header_mapper.py
+is adapted from
+https://github.com/togethercomputer/RedPajama-Data
+
+ Copyright 2023 RedPajama authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-------------------------------------------------------------------------------
+
+The implementation of gpt_evaluator in tools/evaluator/gpt_eval/gpt_evaluator.py
+is adapted from https://github.com/lm-sys/FastChat (Apache License)
+
+Copyright (c) 2023 The FastChat Authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+-----------------------------------------------------
+
+
+The implementations of the checkpoint converters in
+tools/converter/convert_gpt_to_transformers.py and tools/converter/modeling_megatron_llama.py
+are adapted from https://github.com/huggingface/transformers (Apache License)
+
+Copyright (c) 2022 EleutherAI and the HuggingFace Inc. team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+-----------------------------------------------------
+
+Code in thirdparty/Megatron-LM
+is adapted from https://github.com/NVIDIA/Megatron-LM
+
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-----------------------------------------------------
+
+Code in thirdparty/helm
+is adapted from https://github.com/stanford-crfm/helm (Apache License)
+
+Copyright (c) 2023 The helm Authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+-----------------------------------------------------
+
+Code in tests/run.py is adapted from
+https://github.com/alibaba/FederatedScope/blob/master/tests/run.py (Apache License)
+
+Copyright (c) 2023 The FederatedScope Team
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+-----------------------------------------------------
+
+Code in utils/logger_utils.py is adapted from
+https://github.com/MegEngine/YOLOX/blob/main/yolox/utils/logger.py (Apache License)
+
+Copyright 2021 Megvii, Base Detection
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
diff --git a/README.md b/README.md
index 8e498944f..c2af1a99f 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,214 @@
-# data-juicer
\ No newline at end of file
+# Data-Juicer: A Data-Centric Text Processing System for Large Language Models
+
+
+
+
+
+[](docs/DeveloperGuide.md)
+
+[](#documentation-|-文档)
+[](README_ZH.md)
+[](https://alibaba.github.io/data-juicer/)
+[](#demos)
+[](https://modelscope.cn/datasets?organization=Data-Juicer&page=1)
+
+[](tools/quality_classifier/README.md)
+[](tools/evaluator/README.md)
+
+Data-Juicer is a data-centric text processing system to make data higher-quality, juicier, and more digestible for LLMs.
+This project is being actively updated and maintained, and we will periodically enhance and add more features and data recipes. We welcome you to join us in promoting LLM data development and research!
+
+----
+
+Table of Contents
+=================
+
+* [Data-Juicer: A Data-Centric Text Processing System for Large Language Models](#data-juicer-a-data-centric-text-processing-system-for-large-language-models)
+* [Table of Contents](#table-of-contents)
+ * [Features](#features)
+ * [Prerequisites](#prerequisites)
+ * [Installation](#installation)
+ * [Quick Start](#quick-start)
+ * [Data Processing](#data-processing)
+ * [Data Analysis](#data-analysis)
+ * [Data Visualization](#data-visualization)
+ * [Build Up Config Files](#build-up-config-files)
+ * [Preprocess raw data (Optional)](#preprocess-raw-data-optional)
+ * [Documentation | 文档](#documentation-|-文档)
+ * [Data Recipes](#data-recipes)
+ * [Demos](#demos)
+ * [License](#license)
+ * [Contributing](#contributing)
+ * [References](#references)
+
+## Features
+
+- **Broad Range of Operators**: Equipped with 50+ core [operators (OPs)](docs/Operators.md), including Formatters, Mappers, Filters, Deduplicators, and beyond.
+
+- **Specialized Toolkits**: Feature-rich specialized toolkits such as [Text Quality Classifier](tools/quality_classifier/README.md), [Dataset Splitter](tools/preprocess/README.md), [Analysers](#data-analysis), [Evaluators](tools/evaluator/README.md), and more that elevate your dataset handling capabilities.
+
+- **Systematic & Reusable**: Empowering users with a systematic library of reusable [config recipes](configs) and [OPs](docs/Operators.md), designed to function independently of specific datasets, models, or tasks.
+
+- **Data-in-the-loop**: Allowing detailed data analyses with an automated report generation feature for a deeper understanding of your dataset. Coupled with real-time multi-dimension automatic evaluation capabilities, it supports a [feedback loop](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary) at multiple stages in the LLM development process.
+
+- **Comprehensive Processing Recipes**: Offering tens of [pre-built data processing recipes](configs/refine_recipe/README.md) covering pre-training and SFT data, English and Chinese corpora, and more scenarios.
+
+- **User-Friendly Experience**: Designed for simplicity, with [comprehensive documentation](#documentation-|-文档), [easy start guides](#quick-start), and [demo configs](configs/), plus intuitive configuration that only requires adding or removing OPs in [existing configs](configs/config_all.yaml).
+
+- **Flexible & Extensible**: Accommodating most types of data formats (e.g., jsonl, parquet, csv, ...) and allowing flexible combinations of OPs. Feel free to [implement your own OPs](docs/DeveloperGuide.md#build-your-own-ops) for customizable data processing.
+
+- **Enhanced Efficiency**: Providing a speedy data processing pipeline requiring less memory, optimized for maximum productivity.
+
+## Prerequisites
+
+- Python==3.8 is recommended
+- gcc >= 5 (at least C++14 support)
+
+## Installation
+
+- Run the following commands to install the latest `data_juicer` version in
+ editable mode:
+```shell
+cd <path_to_data_juicer>
+pip install -v -e .[all]
+```
+
+- Or install optional dependencies:
+```shell
+cd <path_to_data_juicer>
+pip install -v -e . # install minimal dependencies
+pip install -v -e .[tools] # install a subset of tools dependencies
+```
+
+The dependency options are listed below:
+
+| Tag | Description |
+|----------|------------------------------------------------------------------------|
+| . | Install minimal dependencies for basic Data-Juicer. |
+| .[all] | Install all optional dependencies (all of the following) |
+| .[dev] | Install dependencies for developing the package as contributors |
+| .[tools] | Install dependencies for dedicated tools, such as quality classifiers. |
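+
+- Multiple optional tags from the table above can also be combined in a single command, e.g.:
+```shell
+pip install -v -e .[tools,dev]
+```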
+
+- Installation check:
+```python
+import data_juicer as dj
+print(dj.__version__)
+```
+
+## Quick Start
+
+
+### Data Processing
+
+- Run the `process_data.py` tool with your config as the argument to process
+  your dataset.
+
+```shell
+python tools/process_data.py --config configs/demo/process.yaml
+```
+
+- **NOTICE**: For operators that use third-party models or resources that are not stored locally on your computer, the first run may be slow because these OPs need to download the corresponding resources into a cache directory first.
+The default download cache directory is `~/.cache/data_juicer`. You can change the cache location by setting the shell environment variable `DATA_JUICER_CACHE_HOME` to another directory, and you can change `DATA_JUICER_MODELS_CACHE` or `DATA_JUICER_ASSETS_CACHE` in the same way:
+
+```shell
+# cache home
+export DATA_JUICER_CACHE_HOME="/path/to/another/directory"
+# cache models
+export DATA_JUICER_MODELS_CACHE="/path/to/another/directory/models"
+# cache assets
+export DATA_JUICER_ASSETS_CACHE="/path/to/another/directory/assets"
+```
+
+### Data Analysis
+- Run the `analyze_data.py` tool with your config as the argument to analyse your dataset.
+
+```shell
+python tools/analyze_data.py --config configs/demo/analyser.yaml
+```
+
+- **NOTICE**: The Analyser only computes stats for Filter ops, so any extra Mapper or Deduplicator ops will be ignored during analysis.
+
+### Data Visualization
+
+- Run the `app.py` tool to visualize your dataset in your browser.
+
+```shell
+streamlit run app.py
+```
+
+### Build Up Config Files
+
+- Config files specify global arguments and an operator list for data
+  processing. You need to set:
+  - Global arguments: input/output dataset paths, number of workers, etc.
+  - Operator list: the operators, with their arguments, used to process the dataset.
+- You can build up your own config files by:
+  - ➖: Modify our example config file [`config_all.yaml`](configs/config_all.yaml), which includes **all** ops and their default
+    arguments. You just need to **remove** the ops you won't use and refine
+    the arguments of the remaining ops.
+  - ➕: Build up your own config files **from scratch**. You can refer to our
+    example config file [`config_all.yaml`](configs/config_all.yaml), the [op documents](docs/Operators.md), and the advanced [Build-Up Guide for developers](docs/DeveloperGuide.md#build-your-own-configs).
+  - Besides the yaml files, you also have the flexibility to specify one or
+    more parameters on the command line, which will override the values in
+    the yaml files, e.g., `python xxx.py --config configs/demo/process.yaml --language_id_score_filter.lang=en`
+- The basic config format and definition are shown below, followed by a minimal YAML sketch.
+
+ 
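+
+- As a rough reference, a minimal recipe could look like the following sketch. The paths here are illustrative, and the exact argument names should be checked against [`config_all.yaml`](configs/config_all.yaml) and the [op documents](docs/Operators.md):
+
+```yaml
+# global arguments
+dataset_path: './demos/data/demo-dataset.jsonl'  # illustrative path to the input dataset
+export_path: './outputs/demo-processed.jsonl'    # illustrative path for the processed dataset
+np: 4                                            # number of worker processes
+
+# operator list
+process:
+  - language_id_score_filter:
+      lang: 'en'
+```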
+
+### Preprocess Raw Data (Optional)
+- Our formatters support some common input dataset formats for now:
+ - Multi-sample in one file: jsonl/json, parquet, csv/tsv, etc.
+ - Single-sample in one file: txt, code, docx, pdf, etc.
+- However, data from different sources can be complicated and diverse. For example:
+  - [Raw arxiv data downloaded from S3](https://info.arxiv.org/help/bulk_data_s3.html) includes thousands of tar files, each containing even more gzip files; the expected tex files are nested inside those gzip files, so they are hard to obtain directly.
+  - Some crawled data includes different kinds of files (pdf, html, docx, etc.), and extra information such as tables and charts is hard to extract.
+- Data-Juicer cannot handle every kind of data by itself, so issues/PRs that add support for new data types are welcome!
+- Thus, we provide some **common preprocessing tools** in [`tools/preprocess`](tools/preprocess/) for you to preprocess these data.
+ - You are welcome to make your contributions to new preprocessing tools for the community.
+  - We **highly recommend** preprocessing complicated data into jsonl or parquet files (see the sketch below).
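+
+- For instance, one line of such a preprocessed jsonl file could look like the sketch below, assuming the text content is stored under a `text` key (the key name is configurable) and the `meta` field is purely illustrative:
+
+```json
+{"text": "This is one sample document to be processed by Data-Juicer.", "meta": {"source": "my_corpus"}}
+```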
+
+## Documentation | 文档
+
+- [Overall](README.md) | [概览](README_ZH.md)
+- [Operator Zoo](docs/Operators.md) | [算子库](docs/Operators_ZH.md)
+- [Configs](configs/README.md) | [配置系统](configs/README_ZH.md)
+- [Developer Guide](docs/DeveloperGuide.md) | [开发者指南](docs/DeveloperGuide_ZH.md)
+- Dedicated Toolkits | 专用工具箱
+ - [Quality Classifier](tools/quality_classifier/README.md) | [质量分类器](tools/quality_classifier/README_ZH.md)
+ - [Auto Evaluation](tools/evaluator/README.md) | [自动评测](tools/evaluator/README_ZH.md)
+ - [Preprocess](tools/preprocess/README.md) | [前处理](tools/preprocess/README_ZH.md)
+ - [Postprocess](tools/postprocess/README.md) | [后处理](tools/postprocess/README_ZH.md)
+- [Third-parties (LLM Ecosystems)](thirdparty/README.md) | [第三方库(大语言模型生态)](thirdparty/README_ZH.md)
+- [API references](https://alibaba.github.io/data-juicer/)
+
+## Data Recipes
+- [Recipes for data process in BLOOM](configs/bloom/README.md)
+- [Recipes for data process in RedPajama](configs/redpajama/README.md)
+- [Refined recipes for pretraining data](configs/refine_recipe/README.md)
+- [Refined recipes for SFT data](configs/refine_recipe/README.md#L28)
+
+## Demos
+- Introduction to Data-Juicer [[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)]
+- Data Visualization:
+ - Basic Statistics [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)]
+ - Lexical Diversity [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)]
+ - Operator Effect [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)]
+- Data Processing:
+ - Scientific Literature (e.g. [ArXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)]
+ - Programming Code (e.g. [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)]
+ - Chinese Instruction Data (e.g. [Alpaca-CoT](https://huggingface.co/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/sft_data_zh/summary)]
+- Tool Pool:
+ - Quality Classifier for CommonCrawl [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)]
+ - Auto Evaluation on [HELM](https://github.com/stanford-crfm/helm) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)]
+ - Data Sampling and Mixture [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)]
+- Data Process Loop [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)]
+- Data Process HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)]
+
+## License
+Data-Juicer is released under Apache License 2.0.
+
+## Contributing
+We greatly welcome contributions of new features, bug fixes, and discussions. Please refer to [How-to Guide for Developers](docs/DeveloperGuide.md).
+
+## References
+Our paper is coming soon!
diff --git a/README_ZH.md b/README_ZH.md
new file mode 100644
index 000000000..f63973b3a
--- /dev/null
+++ b/README_ZH.md
@@ -0,0 +1,214 @@
+# Data-Juicer: 为大语言模型提供更高质量、更丰富、更易“消化”的数据
+
+
+
+
+
+[](docs/DeveloperGuide_ZH.md)
+
+[](#documentation-|-文档)
+[](README_ZH.md)
+[](https://alibaba.github.io/data-juicer/)
+[](#demos)
+[](#data-recipes)
+
+[](tools/quality_classifier/README_ZH.md)
+[](tools/evaluator/README_ZH.md)
+
+Data-Juicer 是一个以数据为中心的文本处理系统,旨在为大语言模型 (LLM) 提供更高质量、更丰富、更易“消化”的数据。
+本项目在积极更新和维护中,我们将定期强化和新增更多的功能和数据菜谱。欢迎您加入我们推进 LLM 数据的开发和研究工作!
+
+----
+
+目录
+===
+
+* [Data-Juicer: 为大语言模型提供更高质量、更丰富、更易“消化”的数据](#data-juicer-为大语言模型提供更高质量、更丰富、更易“消化”的数据)
+* [目录](#目录)
+ * [特点](#特点)
+ * [前置条件](#前置条件)
+ * [安装](#安装)
+ * [快速上手](#快速上手)
+ * [数据处理](#数据处理)
+ * [数据分析](#数据分析)
+ * [数据可视化](#数据可视化)
+ * [构建配置文件](#构建配置文件)
+ * [预处理原始数据(可选)](#预处理原始数据(可选))
+ * [Documentation | 文档](#documentation-|-文档)
+ * [数据处理菜谱](#数据处理菜谱)
+ * [演示样例](#演示样例)
+ * [开源协议](#开源协议)
+ * [贡献](#贡献)
+ * [参考文献](#参考文献)
+
+## 特点
+
+* **丰富的算子**: 内置了 50 多个核心 [算子(OPs)](docs/Operators_ZH.md),包括 Formatters,Mappers,Filters,Deduplicators 等。
+
+* **专业的工具库**: 提供功能丰富的专业工具库,例如 [文本质量打分器](tools/quality_classifier/README_ZH.md), [数据分割器](tools/preprocess/README_ZH.md), [分析器](#数据分析), [评估器](tools/evaluator/README_ZH.md) 等,提升您的数据处理能力。
+
+* **系统化 & 可复用**: 为用户提供系统化且可复用的[配置菜谱](configs)和[算子库](docs/Operators_ZH.md),旨在让数据处理独立于特定的数据集、模型或任务运行。
+
+* **数据反馈回路**: 支持详细的数据分析,并提供自动报告生成功能,使您深入了解您的数据集。结合实时多维度自动评估功能,支持在 LLM 开发过程的多个阶段进行[反馈循环](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)。
+
+* **全面的处理菜谱**: 为预训练、SFT、中英文等场景提供数十种[预构建的数据处理菜谱](configs/refine_recipe/README_ZH.md)。
+
+* **用户友好**: 设计简单易用,提供全面的[文档](#documentation-|-文档)、简易[入门指南](#快速上手)和[演示配置](configs/),并且可以轻松地添加/删除[现有配置](configs/config_all.yaml)中的算子。
+
+* **灵活 & 易扩展**: 支持大多数数据格式(如jsonl、parquet、csv等),并允许灵活组合算子。支持[自定义算子](docs/DeveloperGuide_ZH.md#构建自己的算子),以执行定制化的数据处理。
+
+* **效率增强**: 提供高效的数据处理流水线,减少内存占用,提高生产力。
+
+## 前置条件
+
+* 推荐 Python==3.8
+* gcc >= 5 (at least C++14 support)
+
+## 安装
+
+* 运行以下命令以安装 `data_juicer` 可编辑模式的最新版本
+
+```shell
+cd <path_to_data_juicer>
+pip install -v -e .[all]
+```
+
+* 或是安装可选的依赖项:
+
+```shell
+cd <path_to_data_juicer>
+pip install -v -e . # 安装最小依赖
+pip install -v -e .[tools] # 安装部分工具库的依赖
+```
+
+依赖选项如下表所示:
+
+| 标签 | 描述 |
+|----------|----------------------------------------------|
+| . | 安装支持 Data-Juicer 基础功能的最小依赖项 |
+| .[all] | 安装所有可选依赖项(即下面所有依赖项) |
+| .[dev] | 安装作为贡献者开发 Data-Juicer 所需的依赖项 |
+| .[tools] | 安装专用工具库(如质量分类器)所需的依赖项 |
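+
+* 上表中的多个可选依赖标签也可以在同一条命令中组合安装,例如:
+```shell
+pip install -v -e .[tools,dev]
+```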
+
+* 核验安装是否成功:
+
+```python
+import data_juicer as dj
+print(dj.__version__)
+```
+
+## 快速上手
+
+### 数据处理
+
+* 以配置文件路径作为参数来运行 `process_data.py` 来处理数据集。
+
+```shell
+python tools/process_data.py --config configs/demo/process.yaml
+```
+
+* **注意**: 使用未保存在本地的第三方模型或资源的算子第一次运行可能会很慢,因为这些算子需要将相应的资源下载到缓存目录中。默认的下载缓存目录为`~/.cache/data_juicer`。您可通过设置 shell 环境变量 `DATA_JUICER_CACHE_HOME` 更改缓存目录位置,您也可以通过同样的方式更改 `DATA_JUICER_MODELS_CACHE` 或 `DATA_JUICER_ASSETS_CACHE` 来分别修改模型缓存或资源缓存目录:
+
+```shell
+# 缓存主目录
+export DATA_JUICER_CACHE_HOME="/path/to/another/directory"
+# 模型缓存目录
+export DATA_JUICER_MODELS_CACHE="/path/to/another/directory/models"
+# 资源缓存目录
+export DATA_JUICER_ASSETS_CACHE="/path/to/another/directory/assets"
+```
+
+### 数据分析
+
+* 以配置文件路径为参数运行 `analyze_data.py` 来分析数据集。
+
+```shell
+python tools/analyze_data.py --config configs/demo/analyser.yaml
+```
+
+* **注意**: Analyser 只计算 Filter 算子产生的统计信息,其他的算子(例如 Mapper 和 Deduplicator)会在分析过程中被忽略。
+
+### 数据可视化
+
+* 运行 `app.py` 来在浏览器中可视化您的数据集。
+
+```shell
+streamlit run app.py
+```
+
+### 构建配置文件
+
+* 配置文件包含一系列全局参数和用于数据处理的算子列表。您需要设置:
+ * 全局参数: 输入/输出 数据集路径,worker 进程数量等。
+ * 算子列表:列出用于处理数据集的算子及其参数。
+* 您可以通过如下方式构建自己的配置文件:
+ * ➖:修改我们的样例配置文件 [`config_all.yaml`](configs/config_all.yaml)。该文件包含了**所有**算子以及算子对应的默认参数。您只需要**移除**不需要的算子并重新设置部分算子的参数即可。
+  * ➕:从头开始构建自己的配置文件。您可以参考我们提供的样例配置文件 [`config_all.yaml`](configs/config_all.yaml),[算子文档](docs/Operators_ZH.md),以及 [开发者指南](docs/DeveloperGuide_ZH.md#构建自己的算子)。
+ * 除了使用 yaml 文件外,您还可以在命令行上指定一个或多个参数,这些参数将覆盖 yaml 文件中的值,例如:`python xxx.py --config configs/demo/process.yaml --language_id_score_filter.lang=en`
+* 基础的配置项格式及定义如下图所示,其后附有一个最小的 YAML 配置示例。
+
+ 
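+
+* 作为参考,一个最小的配置菜谱可以写成如下形式。其中路径仅作演示,具体参数名请以 [`config_all.yaml`](configs/config_all.yaml) 和[算子文档](docs/Operators_ZH.md)为准:
+
+```yaml
+# 全局参数
+dataset_path: './demos/data/demo-dataset.jsonl'  # 输入数据集路径(仅作演示)
+export_path: './outputs/demo-processed.jsonl'    # 处理后数据集的导出路径(仅作演示)
+np: 4                                            # worker 进程数量
+
+# 算子列表
+process:
+  - language_id_score_filter:
+      lang: 'en'
+```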
+
+### 预处理原始数据(可选)
+
+* 我们的 Formatter 目前支持一些常见的输入数据集格式:
+ * 单个文件中包含多个样本:jsonl/json、parquet、csv/tsv 等。
+ * 单个文件中包含单个样本:txt、code、docx、pdf 等。
+* 但来自不同源的数据是复杂和多样化的,例如:
+ * [从 S3 下载的 arxiv 原始数据](https://info.arxiv.org/help/bulk_data_s3.html) 包括数千个 tar 文件以及更多的 gzip 文件,并且所需的 tex 文件在 gzip 文件中,很难直接获取。
+ * 一些爬取的数据包含不同类型的文件(pdf、html、docx 等),并且很难提取额外的信息,例如表格、图表等。
+* Data-Juicer 不可能处理所有类型的数据,欢迎提 Issues/PRs,贡献对新数据类型的处理能力!
+* 因此我们在 [`tools/preprocess`](tools/preprocess) 中提供了一些**常见的预处理工具**,用于预处理这些类型各异的数据。
+ * 欢迎您为社区贡献新的预处理工具。
+  * 我们**强烈建议**将复杂的数据预处理为 jsonl 或 parquet 文件(参考下方示例)。
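+
+* 例如,预处理后的 jsonl 文件中,单行数据形如下面的示例。这里假设文本内容存放在 `text` 字段中(键名可在配置中修改),`meta` 字段及其内容仅作演示:
+
+```json
+{"text": "这是一条待 Data-Juicer 处理的示例文档。", "meta": {"source": "my_corpus"}}
+```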
+
+## Documentation | 文档
+
+* [Overall](README.md) | [概览](README_ZH.md)
+* [Operator Zoo](docs/Operators.md) | [算子库](docs/Operators_ZH.md)
+* [Configs](configs/README.md) | [配置系统](configs/README_ZH.md)
+* [Developer Guide](docs/DeveloperGuide.md) | [开发者指南](docs/DeveloperGuide_ZH.md)
+* Dedicated Toolkits | 专用工具箱
+ * [Quality Classifier](tools/quality_classifier/README.md) | [质量分类器](tools/quality_classifier/README_ZH.md)
+ * [Auto Evaluation](tools/evaluator/README.md) | [自动评测](tools/evaluator/README_ZH.md)
+ * [Preprocess](tools/preprocess/README.md) | [前处理](tools/preprocess/README_ZH.md)
+ * [Postprocess](tools/postprocess/README.md) | [后处理](tools/postprocess/README_ZH.md)
+* [Third-parties (LLM Ecosystems)](thirdparty/README.md) | [第三方库(大语言模型生态)](thirdparty/README_ZH.md)
+* [API references](https://alibaba.github.io/data-juicer/)
+
+## 数据处理菜谱
+
+* [BLOOM 数据处理菜谱](configs/bloom/README_ZH.md)
+* [RedPajama 数据处理菜谱](configs/redpajama/README_ZH.md)
+* [预训练数据增强菜谱](configs/refine_recipe/README_ZH.md)
+* [SFT数据增强菜谱](configs/refine_recipe/README_ZH.md#L32)
+
+## 演示样例
+
+* Data-Juicer 介绍 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)]
+* 数据可视化:
+ * 基础指标统计 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)]
+ * 词汇多样性 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)]
+ * 算子效果 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)]
+* 数据处理:
+ * 科学文献 (例如 [ArXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)]
+ * 编程代码 (例如 [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)]
+ * 中文指令数据 (例如 [Alpaca-CoT](https://huggingface.co/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/sft_data_zh/summary)]
+* 工具池:
+ * CommonCrawl 质量分类器 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)]
+ * 基于 [HELM](https://github.com/stanford-crfm/helm) 的自动评测 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)]
+ * 数据采样及混合 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)]
+* 数据处理回路 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)]
+* 数据处理 HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)]
+
+## 开源协议
+
+Data-Juicer 在 Apache License 2.0 协议下发布。
+
+## 贡献
+
+我们非常欢迎贡献新功能、修复漏洞以及讨论。请参考[开发者指南](docs/DeveloperGuide_ZH.md)。
+
+## 参考文献
+
+我们的论文即将发布!
diff --git a/app.py b/app.py
new file mode 100644
index 000000000..f7f842e9d
--- /dev/null
+++ b/app.py
@@ -0,0 +1,765 @@
+# Some code here has been modified from:
+# https://huggingface.co/spaces/huggingface/text-data-filtering
+# --------------------------------------------------------
+
+import copy
+import math
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+import yaml
+from loguru import logger
+
+import plotly.express as px
+from data_juicer.analysis.diversity_analysis import (DiversityAnalysis,
+ get_diversity,
+ prepare_diversity_model)
+from data_juicer.config import init_configs
+from data_juicer.core import Analyser, Executor
+from data_juicer.ops.base_op import OPERATORS
+from data_juicer.utils.logger_utils import get_log_file_path
+
+
+@st.cache_data
+def convert_csv(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_csv().encode('utf-8')
+
+
+@st.cache_data
+def convert_jsonl(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_json(orient='records', lines=True).encode('utf-8')
+
+
+@st.cache_data
+def get_diversity_model(lang):
+ diversity_model = prepare_diversity_model(lang)
+ return diversity_model
+
+
+@st.cache_data
+def postproc_diversity(dataframe, **kwargs):
+ df = get_diversity(dataframe, **kwargs)
+ return df
+
+
+def read_log_file():
+ log_f_path = get_log_file_path()
+ if log_f_path is None or not os.path.exists(log_f_path):
+ return ''
+ sys.stdout.flush()
+ with open(log_f_path, 'r') as f:
+ return f.read()
+
+
+def pretty_out(d):
+ res = ''
+ process = ''
+ op_names = set(OPERATORS.modules.keys())
+ for key, value in d.items():
+ if key == 'process':
+ process = yaml.dump(value,
+ allow_unicode=True,
+ default_flow_style=False)
+ elif key == 'config' or key.split('.')[0] in op_names:
+ continue
+ else:
+ res += f'{key}:\n \t {value}\n'
+ res += 'process:\n' + \
+ '\n'.join(['\t' + line for line in process.splitlines()])
+
+ return res
+
+
+def parse_cfg():
+
+ cfg_file = st.session_state.input_cfg_file
+ cfg_cmd = st.session_state.input_cfg_cmd
+
+ cfg_f_name = 'null'
+ del_cfg_file = False
+ if cfg_file is not None:
+ cfg_f_name = cfg_file.name
+ file_contents = cfg_file.getvalue()
+ with open(cfg_f_name, 'wb') as f:
+ f.write(file_contents)
+ cfg_cmd = f'--config {cfg_f_name}'
+ del_cfg_file = True
+
+ args_in_cmd = cfg_cmd.split()
+
+ if len(args_in_cmd) >= 2 and args_in_cmd[0] == '--config':
+ cfg_f_name = args_in_cmd[1]
+ else:
+ st.warning('Please specify a config command or upload a config file.')
+ st.stop()
+
+ if not os.path.exists(cfg_f_name):
+ st.warning('do not parse'
+ f'config file does not exist with cfg_f_name={cfg_f_name}')
+ st.stop()
+
+ with open(cfg_f_name, 'r') as cfg_f:
+ specified_cfg = yaml.safe_load(cfg_f)
+
+ try:
+ parsed_cfg = init_configs(args=args_in_cmd)
+ st.session_state.cfg = parsed_cfg
+ if del_cfg_file:
+ os.remove(cfg_f_name)
+ return pretty_out(parsed_cfg), pretty_out(specified_cfg), parsed_cfg
+ except Exception as e:
+ return str(e), pretty_out(specified_cfg), None
+
+
+def analyze_and_show_res():
+ images_ori = []
+ cfg = st.session_state.get('cfg', parse_cfg()[2])
+ if cfg is None:
+        raise ValueError('you have not specified a valid config')
+    # force generating the all-in-one stats figure
+    cfg['save_stats_in_one_file'] = True
+
+ logger.info('=========Stage 1: analyze original data=========')
+ analyzer = Analyser(cfg)
+ dataset = analyzer.run()
+
+ analysis_res_ori = pd.read_csv(
+ os.path.join(analyzer.analysis_path, 'overall.csv'))
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+
+ st.session_state.dataset = dataset
+ st.session_state.orginal_overall = analysis_res_ori
+ st.session_state.original_imgs = images_ori
+
+
+def process_and_show_res():
+ images_processed = []
+ cfg = st.session_state.get('cfg', parse_cfg()[2])
+ if cfg is None:
+        raise ValueError('you have not specified a valid config')
+    # force generating the all-in-one stats figure
+    cfg['save_stats_in_one_file'] = True
+ logger.info('=========Stage 2: process original data=========')
+ executor = Executor(cfg)
+ dataset = executor.run()
+
+ logger.info('=========Stage 3: analyze the processed data==========')
+ analysis_res_processed = pd.DataFrame()
+ try:
+ cfg_for_processed_data = copy.deepcopy(cfg)
+ cfg_for_processed_data.dataset_path = cfg.export_path
+
+ cfg_for_processed_data.export_path = os.path.dirname(
+ cfg.export_path) + '_processed/data.jsonl'
+ cfg_for_processed_data.text_keys_to_load = [cfg.text_key_to_process]
+ analyzer = Analyser(cfg_for_processed_data)
+ analyzer.run()
+ analysis_res_processed = pd.read_csv(
+ os.path.join(analyzer.analysis_path, 'overall.csv'))
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_processed.append(
+ os.path.join(analyzer.analysis_path, f_path))
+ except Exception as e:
+        st.warning(f'Something went wrong: {str(e)}')
+
+ logger.info('=========Stage 4: Render the analysis results==========')
+ st.session_state.dataset = dataset
+ st.session_state.processed_overall = analysis_res_processed
+ st.session_state.processed_imgs = images_processed
+
+
+def get_min_max_step(data):
+ max_value = np.max(data)
+ if max_value > 2.0:
+ min_value = 0
+ max_value = int(max_value + 1)
+ step = 1
+ else:
+ min_value = 0.0
+ max_value = max(1.0, max_value)
+ step = 0.01
+ return min_value, max_value, step
+
+
+op_stats_dict = {
+ 'alphanumeric_filter': ['alpha_token_ratio', 'alnum_ratio'],
+ 'average_line_length_filter': ['avg_line_length'],
+ 'character_repetition_filter': ['char_rep_ratio'],
+ 'flagged_words_filter': ['flagged_words_ratio'],
+ 'language_id_score_filter': ['lang', 'lang_score'],
+ 'maximum_line_length_filter': ['max_line_length'],
+ 'perplexity_filter': ['perplexity'],
+ 'special_characters_filter': ['special_char_ratio'],
+ 'stopwords_filter': ['stopwords_ratio'],
+ 'text_length_filter': ['text_len'],
+ 'words_num_filter': ['num_words'],
+ 'word_repetition_filter': ['word_rep_ratio'],
+}
+
+
+class Visualize:
+
+ @staticmethod
+ def filter_dataset(dataset):
+
+ text = dataset['text']
+ if 'stats' not in dataset.features:
+ stats = pd.DataFrame(dataset['stats.meta'])
+ else:
+ stats = pd.DataFrame(dataset['stats'])
+ stats['text'] = text
+
+ non_num_list = ['lang']
+ min_cutoff_list = [
+ 'lang_score',
+ 'stopwords_ratio',
+ ]
+ max_cutoff_list = [
+ 'flagged_words_ratio',
+ 'max_ppl',
+ ]
+ mask_list = ['text']
+
+ cfg = st.session_state.get('cfg', None)
+ if cfg is None:
+ return
+
+ def set_sliders(total_stats, ordered):
+ stats = copy.deepcopy(total_stats)
+ conds = list()
+ index = 1
+ for op_cfg in cfg.process:
+ op_name = list(op_cfg.keys())[0]
+ op_stats = op_stats_dict.get(op_name, [])
+
+ cutoff_ratio = None
+
+ with st.sidebar.expander(f'{index} {op_name}'):
+
+ for column_name in op_stats:
+ if column_name not in stats:
+ continue
+ data = stats[column_name]
+
+ if column_name in non_num_list:
+ options = ['all'] + list(set(data))
+ label = f'Which {column_name} would \
+ you like to keep?'
+
+ selected = st.selectbox(
+ label=label,
+ options=options,
+ )
+ if selected == 'all':
+ cond = [True] * len(data)
+ else:
+ cond = data == selected
+ Visualize.display_discarded_ratio(
+ cond, column_name)
+
+ elif column_name in min_cutoff_list:
+ label = f'If the {column_name} of a document \
+ is lower than this number, \
+ the document is removed.'
+
+ low, high, step = get_min_max_step(data)
+
+ cutoff_ratio = st.slider(label,
+ low,
+ high,
+ low,
+ step=step)
+ cond = data >= cutoff_ratio
+ Visualize.display_discarded_ratio(
+ cond, column_name)
+
+ elif column_name in max_cutoff_list:
+ label = f'If the {column_name} of a document \
+ is higher than this number, \
+ the document is removed.'
+
+ low, high, step = get_min_max_step(data)
+ cutoff_ratio = st.slider(label,
+ low,
+ high,
+ high,
+ step=step)
+ cond = data <= cutoff_ratio
+
+ Visualize.display_discarded_ratio(
+ cond, column_name)
+ elif column_name not in mask_list:
+ # lower
+ label = f'If the {column_name} of a document \
+ is lower than this number, \
+ the document is removed.'
+
+ low, high, step = get_min_max_step(data)
+
+ cutoff_ratio_l = st.slider(label,
+ low,
+ high,
+ low,
+ step=step)
+ cond_l = data >= cutoff_ratio_l
+
+ Visualize.display_discarded_ratio(
+ cond_l, column_name)
+
+ # higher
+ label = f'If the {column_name} of a document \
+ is higher than this number, \
+ the document is removed.'
+
+ cutoff_ratio_h = st.slider(label,
+ low,
+ high,
+ high,
+ step=step)
+
+ cond_h = data <= cutoff_ratio_h
+ Visualize.display_discarded_ratio(
+ cond_h, column_name)
+ cond = [
+ low & high
+ for low, high in zip(cond_l, cond_h)
+ ]
+
+ cutoff_ratio = (cutoff_ratio_l, cutoff_ratio_h)
+
+ if column_name not in mask_list:
+ Visualize.draw_hist(data, cutoff_ratio)
+ conds.append({
+ (' '.join([str(index), op_name]), column_name):
+ cond
+ })
+
+ if ordered:
+ stats = stats.loc[cond]
+ index += 1
+ return conds, stats
+
+ st.sidebar.subheader('Parameters of filter ops')
+ ordered = st.sidebar.checkbox('Process by op order')
+ conds, filtered_stats = set_sliders(stats, ordered)
+
+ st.subheader('How many samples do you want to show?')
+ show_num = st.number_input(
+ label='How many samples do you want to show?',
+ value=5,
+ label_visibility='hidden')
+ if ordered:
+ all_conds = [
+ True if i in filtered_stats.index else False
+ for i in range(len(stats))
+ ]
+ else:
+ all_conds = np.all([list(cond.values())[0] for cond in conds],
+ axis=0)
+ ds = pd.DataFrame(dataset)
+        Visualize.display_dataset(ds, all_conds, show_num, 'Retained samples',
+ 'docs')
+ st.download_button('Download Retained data as JSONL',
+ data=convert_jsonl(ds.loc[all_conds]),
+ file_name='retained.jsonl')
+ Visualize.display_dataset(ds, np.invert(all_conds), show_num,
+                                   'Discarded samples', 'docs')
+ st.download_button('Download Discarded data as JSONL',
+ data=convert_jsonl(ds.loc[np.invert(all_conds)]),
+ file_name='discarded.jsonl')
+ display_discarded_details = st.checkbox(
+ 'Display discarded documents by filter details')
+
+ show_stats = copy.deepcopy(stats)
+ bar_labels = []
+ bar_sizes = []
+ for item in conds:
+ for op_key, cond in item.items():
+ op_name, column_name = op_key
+ if column_name not in mask_list:
+ sub_stats = show_stats[[column_name, 'text']]
+ if display_discarded_details:
+ Visualize.display_dataset(
+ sub_stats,
+ np.invert(cond) if len(cond) > 0 else [],
+ show_num,
+ # f'Discarded documents for the filter on \
+ f'{op_name} {column_name} filtered ',
+ 'docs',
+ )
+ before_filtered_num = len(show_stats.index)
+ if ordered:
+ show_stats = show_stats.loc[cond]
+ retained = np.sum(1 * cond)
+ filtered = before_filtered_num - len(show_stats.index)
+ else:
+ retained = np.sum(1 * cond)
+ filtered = before_filtered_num - retained
+
+ bar_sizes.append(retained)
+ bar_sizes.append(filtered)
+ bar_labels.append(f'{op_name}\n{column_name}')
+
+ bar_title = 'Effect of Filter OPs'
+ Visualize.draw_stack_bar(bar_sizes, bar_labels, len(stats.index),
+ bar_title)
+
+ @staticmethod
+ def diversity():
+ with st.expander('Diversity for sft dataset', expanded=False):
+ dataset = st.session_state.get('dataset', None)
+ cfg = st.session_state.get('cfg', parse_cfg()[2])
+ if dataset:
+
+ col1, col2, col3, col4 = st.columns(4)
+ with col1:
+ label = 'Which language of your dataset'
+ options = ['en', 'zh']
+ lang_select = st.selectbox(
+ label=label,
+ options=options,
+ )
+ with col2:
+ top_k_verbs = st.number_input(
+ 'Set the top_k nums of verbs', value=20)
+ with col3:
+ top_k_nouns = st.number_input(
+ 'Set the top_k nums of nouns', value=4)
+ with col4:
+ threshold = st.slider('Count threshold',
+ min_value=0,
+ value=32,
+ max_value=100,
+ step=1)
+
+                diversity_btn = st.button('Analyse diversity',
+                                          use_container_width=True)
+ output_path = os.path.join(os.path.dirname(cfg.export_path),
+ 'analysis')
+ raw_df = None
+                if diversity_btn:
+ try:
+ diversity_analysis = DiversityAnalysis(
+ dataset, output_path)
+                        with st.spinner('Analysing diversity, please wait...'):
+ raw_df = diversity_analysis.compute(
+ lang_or_model=get_diversity_model(lang_select),
+ column_name=cfg.text_key_to_process)
+
+ st.session_state[f'diversity{lang_select}'] = raw_df
+
+ except Exception as e:
+ st.warning(f'Error {str(e)} in {lang_select}')
+ else:
+ raw_df = st.session_state.get(f'diversity{lang_select}',
+ None)
+
+ if raw_df is not None:
+ df = postproc_diversity(raw_df,
+ top_k_verbs=top_k_verbs,
+ top_k_nouns=top_k_nouns)
+ df = df[df['count'] >= threshold]
+ Visualize.draw_sunburst(df,
+ path=['verb', 'noun'],
+ values='count')
+
+ st.download_button(
+ label='Download diversity data as CSV',
+ data=convert_csv(df),
+ file_name='diversity.csv',
+ mime='text/csv',
+ )
+ else:
+ st.warning('Please analyze original data first')
+
+ @staticmethod
+ def draw_sunburst(df, path, values):
+
+ fig = px.sunburst(df, path=path, values=values)
+ fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
+ font_family='Times New Roman',
+ font=dict(size=40))
+ st.plotly_chart(fig, use_container_width=True)
+
+ @staticmethod
+ def draw_stack_bar(bar_sizes, bar_labels, total_num, title=''):
+ filtered_size = [
+ k / total_num * 100 for i, k in enumerate(bar_sizes[::-1])
+ if i % 2 == 0
+ ]
+ retain_size = [
+ k / total_num * 100 for i, k in enumerate(bar_sizes[::-1])
+ if i % 2 != 0
+ ]
+ plt.clf()
+ plt.title(title)
+ bar_labels = bar_labels[::-1]
+ # retained
+ r_bars = plt.barh(bar_labels,
+ retain_size,
+ label='Retained',
+ height=0.5,
+ color='limegreen')
+
+ # filtered
+ f_bars = plt.barh(bar_labels,
+ filtered_size,
+ label='Filtered',
+ left=retain_size,
+ height=0.5,
+ color='orangered')
+
+ for idx, bar in enumerate(r_bars):
+ width = bar.get_width()
+ plt.text(bar.get_x() + width / 2,
+ bar.get_y() + bar.get_height() / 2,
+ f'{retain_size[idx]:.2f}%',
+ ha='center',
+ va='center')
+
+ for idx, bar in enumerate(f_bars):
+ width = bar.get_width()
+ plt.text(bar.get_x() + width / 2,
+ bar.get_y() + bar.get_height() / 2,
+ f'{filtered_size[idx]:.2f}%',
+ ha='center',
+ va='center')
+
+ plt.legend()
+ plt.gcf()
+ st.pyplot(plt, use_container_width=True)
+
+ @staticmethod
+ def draw_pie(bar_labels, big_sizes, small_labels, bar_sizes):
+ plt.clf()
+
+ # filter op circle
+ plt.pie(big_sizes, labels=bar_labels, startangle=90, frame=True)
+ # retained and filtered circle
+ plt.pie(bar_sizes,
+ labels=small_labels,
+ radius=0.7,
+ rotatelabels=True,
+ startangle=90,
+ labeldistance=0.7)
+ centre_circle = plt.Circle((0, 0), 0.4, color='white', linewidth=0)
+ fig = plt.gcf()
+ fig.gca().add_artist(centre_circle)
+
+ plt.axis('equal')
+ plt.tight_layout()
+ st.pyplot(plt, use_container_width=True)
+
+ @staticmethod
+ def display_discarded_ratio(cond, key):
+ if len(cond) > 0:
+ st.caption(
+ f':red[{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}%] \
+ of the total (:red[{len(cond)}]) is discarded with {key}.')
+ else:
+ st.caption(f':red[{0:.2f}%] \
+ of the total (:red[0]) is discarded with {key}.')
+
+ @staticmethod
+ def display_dataset(dataframe, cond, show_num, desp, type, all=True):
+ examples = dataframe.loc[cond]
+ if all or len(examples) > 0:
+ st.subheader(
+ f'{desp}: :red[{len(examples)}] of '
+ f'{len(dataframe.index)} {type} '
+ f'(:red[{len(examples)/len(dataframe.index) * 100:.2f}%])')
+
+ # st.markdown('Click on a column to sort by it, \
+ # place the cursor on the text to display it.')
+ st.dataframe(examples[:show_num], use_container_width=True)
+
+ @staticmethod
+ def draw_hist(data, cutoff=None):
+
+ fig, ax = plt.subplots()
+ data_num = len(data)
+ if data_num >= 100:
+ rec_bins = int(math.sqrt(len(data)))
+ else:
+ rec_bins = 50
+
+ if data_num > 0:
+ ax.hist(data, bins=rec_bins, density=True)
+ if hasattr(data, 'name'):
+ ax.set_title(data.name)
+
+ if isinstance(cutoff, (float, int)):
+ ax.axvline(x=cutoff, color='r', linestyle='dashed')
+ elif isinstance(cutoff, tuple) and len(cutoff) == 2:
+ ax.axvline(x=cutoff[0], color='r', linestyle='dashed')
+ ax.axvline(x=cutoff[1], color='r', linestyle='dashed')
+ st.pyplot(fig)
+
+ @staticmethod
+ def setup():
+ st.set_page_config(
+ page_title='Data-Juicer',
+            page_icon=':smile:',
+ layout='wide',
+ # initial_sidebar_state="expanded",
+ )
+
+ readme_link = 'https://code.alibaba-inc.com/DAIL-LLM/' \
+ 'data_juicer/blob/master/README.md'
+
+        st.markdown(
+            '<div align="center"> Data-Juicer </div>',
+            unsafe_allow_html=True,
+        )
+        st.markdown(
+            f'<div align="center"> A Dataset Preparation System for Large '
+            f'Models, see more detail in <a href={readme_link}>Readme</a>'
+            f'</div>',
+            unsafe_allow_html=True,
+        )
+        raw_html = raw_html.replace('<li>', '\n*')
+        raw_html = raw_html.replace('</li>', '')
+ parser = HTMLParser(raw_html)
+ return parser.text()
+
+ sample[self.text_key] = _clean_html(sample[self.text_key])
+ return sample
diff --git a/data_juicer/ops/mapper/clean_ip_mapper.py b/data_juicer/ops/mapper/clean_ip_mapper.py
new file mode 100644
index 000000000..8f1ce9684
--- /dev/null
+++ b/data_juicer/ops/mapper/clean_ip_mapper.py
@@ -0,0 +1,34 @@
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('clean_ip_mapper')
+class CleanIpMapper(Mapper):
+ """Mapper to clean ipv4 and ipv6 address in text samples."""
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param args: extra args
+ :param kwargs: extra args
+ """
+
+ super().__init__(*args, **kwargs)
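+        # ipv4: four dot-separated octets, each in 0-255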
+ self.pattern = r'(?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|'
+ self.pattern += r'(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))'
+ self.pattern += r'{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|'
+ self.pattern += r'(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|'
+ self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}' # ipv6
+
+ def process(self, sample):
+
+ if not re.search(self.pattern, sample[self.text_key], flags=re.DOTALL):
+ return sample
+
+ sample[self.text_key] = re.sub(pattern=self.pattern,
+ repl=r'',
+ string=sample[self.text_key],
+ flags=re.DOTALL)
+ return sample
diff --git a/data_juicer/ops/mapper/clean_links_mapper.py b/data_juicer/ops/mapper/clean_links_mapper.py
new file mode 100644
index 000000000..b8d4945fe
--- /dev/null
+++ b/data_juicer/ops/mapper/clean_links_mapper.py
@@ -0,0 +1,39 @@
+# Some code here has been modified from:
+# https://github.com/kallewesterling/CleanText/
+# --------------------------------------------------------
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('clean_links_mapper')
+class CleanLinksMapper(Mapper):
+ """Mapper to clean links like http/https/ftp in text samples."""
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.pattern = r'(?i)\b('
+ self.pattern += r'(?:[a-z][\w-]+:(?:\/{1,3}|'
+ self.pattern += r'[a-z0-9%])|www\d{0,3}[.]|'
+ self.pattern += r'[a-z0-9.\-]+[.][a-z]{2,4}\/)'
+ self.pattern += r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))'
+ self.pattern += r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
+ self.pattern += r'[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])'
+ self.pattern += r')'
+
+ def process(self, sample):
+
+ if not re.search(self.pattern, sample[self.text_key], flags=re.DOTALL):
+ return sample
+
+ sample[self.text_key] = re.sub(pattern=self.pattern,
+ repl=r'',
+ string=sample[self.text_key],
+ flags=re.DOTALL)
+ return sample
diff --git a/data_juicer/ops/mapper/expand_macro_mapper.py b/data_juicer/ops/mapper/expand_macro_mapper.py
new file mode 100644
index 000000000..1792796ca
--- /dev/null
+++ b/data_juicer/ops/mapper/expand_macro_mapper.py
@@ -0,0 +1,80 @@
+# Some code here has been modified from:
+# https://github.com/togethercomputer/RedPajama-Data/blob/main/data_prep/arxiv/arxiv_cleaner.py
+# --------------------------------------------------------
+
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('expand_macro_mapper')
+class ExpandMacroMapper(Mapper):
+ """Mapper to expand macro definitions in the document body of Latex
+ samples."""
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+
+ def _build_non_arg_macros_dict(self, file_content):
+ # regex for extracting \newcommand macros without arguments
+ non_arg_nc_reg = re.compile(
+ # this regex matches the following:
+ # \newcommand{\macro_name}{macro_value}
+ # \newcommand*{\macro_name}{macro_value}
+ # where macro_name is only allowed to contain letters and numbers;
+ # macro_value can contain any character.
+ pattern=r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$',
+ flags=re.MULTILINE)
+
+ # regex for extracting \def macros without arguments
+ non_arg_def_reg = re.compile(
+ # this regex matches the following:
+ # \def\macro_name{macro_value}
+ # where macro_name is only allowed to contain letters and numbers;
+ # macro_value can contain any character.
+ pattern=r'\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$',
+ flags=re.MULTILINE)
+
+ # Extract all user-defined LaTeX macros from the preamble
+ macros = {}
+ for reg in [non_arg_nc_reg, non_arg_def_reg]:
+ for match in reg.finditer(file_content):
+ # convert the macro name and value to a raw string that can be
+ # used in re.sub
+ macro_name = match.group(1).encode('unicode-escape').decode(
+ 'utf-8')
+ macro_val = match.group(2).encode('unicode-escape').decode(
+ 'utf-8')
+
+ macros[macro_name] = macro_val
+ return macros
+
+ def process(self, sample):
+ non_arg_macros = self._build_non_arg_macros_dict(sample[self.text_key])
+
+ # TODO: macros that take arguments are not supported yet
+ arg_macros = {}
+
+ # inline-expand all non-arg macros
+ for macro_name, macro_value in non_arg_macros.items():
+ sample[self.text_key] = re.sub(
+ # make pattern grouped to make sure that the macro is not part
+ # of a longer alphanumeric word
+ pattern=r'(' + macro_name + r')' + r'([^a-zA-Z0-9])',
+ # replace the macro with its value and add back the character
+ # that was matched after the macro
+ repl=macro_value + r'\2',
+ string=sample[self.text_key])
+
+ # inline-expand all macros that use args
+ # TODO: inline-expand macros with args
+ for macro_name, macro_value in arg_macros.items():
+ pass
+
+ return sample
diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py
new file mode 100644
index 000000000..275fbba28
--- /dev/null
+++ b/data_juicer/ops/mapper/fix_unicode_mapper.py
@@ -0,0 +1,21 @@
+import ftfy
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('fix_unicode_mapper')
+class FixUnicodeMapper(Mapper):
+ """Mapper to fix unicode errors in text samples."""
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+
+ def process(self, sample):
+ sample[self.text_key] = ftfy.fix_text(sample[self.text_key])
+ return sample
diff --git a/data_juicer/ops/mapper/punctuation_normalization_mapper.py b/data_juicer/ops/mapper/punctuation_normalization_mapper.py
new file mode 100644
index 000000000..e8cdf3e60
--- /dev/null
+++ b/data_juicer/ops/mapper/punctuation_normalization_mapper.py
@@ -0,0 +1,62 @@
+# Some code here has been modified from:
+# https://github.com/bigscience-workshop/data-preparation
+# --------------------------------------------------------
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('punctuation_normalization_mapper')
+class PunctuationNormalizationMapper(Mapper):
+ """Mapper to normalize unicode punctuations to English punctuations in text
+ \ samples."""
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.punctuation_unicode = {
+ ',': ',',
+ '。': '.',
+ '、': ',',
+ '„': '"',
+ '”': '"',
+ '“': '"',
+ '«': '"',
+ '»': '"',
+ '1': '"',
+ '」': '"',
+ '「': '"',
+ '《': '"',
+ '》': '"',
+ '´': "'",
+ '∶': ':',
+ ':': ':',
+ '?': '?',
+ '!': '!',
+ '(': '(',
+ ')': ')',
+ ';': ';',
+ '–': '-',
+ '—': ' - ',
+ '.': '. ',
+ '~': '~',
+ '’': "'",
+ '…': '...',
+ '━': '-',
+ '〈': '<',
+ '〉': '>',
+ '【': '[',
+ '】': ']',
+ '%': '%',
+ '►': '-',
+ }
+
+ def process(self, sample):
+ sample[self.text_key] = ''.join([
+ self.punctuation_unicode.get(c, c) for c in sample[self.text_key]
+ ])
+ return sample
diff --git a/data_juicer/ops/mapper/remove_bibliography_mapper.py b/data_juicer/ops/mapper/remove_bibliography_mapper.py
new file mode 100644
index 000000000..7a5c815ca
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_bibliography_mapper.py
@@ -0,0 +1,35 @@
+# Some code here has been modified from:
+# https://github.com/togethercomputer/RedPajama-Data/
+# --------------------------------------------------------
+
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('remove_bibliography_mapper')
+class RemoveBibliographyMapper(Mapper):
+ """Mapper to remove bibliography at the end of documents in Latex
+ samples."""
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
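+        # match from the appendix/bibliography start marker to the end of doc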
+ self.pattern = r'(\\appendix|'
+ self.pattern += r'\\begin\{references\}|'
+ self.pattern += r'\\begin\{REFERENCES\}|'
+ self.pattern += r'\\begin\{thebibliography\}|'
+ self.pattern += r'\\bibliography\{.*\}'
+ self.pattern += r').*$'
+
+ def process(self, sample):
+ sample[self.text_key] = re.sub(pattern=self.pattern,
+ repl=r'',
+ string=sample[self.text_key],
+ flags=re.DOTALL)
+ return sample
diff --git a/data_juicer/ops/mapper/remove_comments_mapper.py b/data_juicer/ops/mapper/remove_comments_mapper.py
new file mode 100644
index 000000000..f49bc9065
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_comments_mapper.py
@@ -0,0 +1,55 @@
+# Some code here has been modified from:
+# https://github.com/togethercomputer/RedPajama-Data/
+# --------------------------------------------------------
+
+from typing import List, Union
+
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('remove_comments_mapper')
+class RemoveCommentsMapper(Mapper):
+ """
+ Mapper to remove comments in different kinds of documents.
+
+    Only supports 'tex' for now.
+ """
+
+ def __init__(self,
+ doc_type: Union[str, List[str]] = 'tex',
+ inline: bool = True,
+ multiline: bool = True,
+ *args,
+ **kwargs):
+ """
+ Initialization method.
+
+ :param doc_type: Type of document to remove comments.
+ :param inline: Whether to remove inline comments.
+ :param multiline: Whether to remove multiline comments.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.doc_type = doc_type
+ self.inline = inline
+ self.multiline = multiline
+
+ def process(self, sample):
+ # TODO: remove different comments by sample type
+
+ if self.inline:
+            # remove inline comments: a '%' not preceded by a backslash,
+            # up to the end of the line
+ sample[self.text_key] = re.sub(pattern=r'[^\\]%.+$',
+ repl=r'',
+ string=sample[self.text_key],
+ flags=re.MULTILINE)
+
+ if self.multiline:
+ sample[self.text_key] = re.sub(pattern=r'(?m)^%.*\n?',
+ repl=r'',
+ string=sample[self.text_key],
+ flags=re.MULTILINE)
+ return sample
diff --git a/data_juicer/ops/mapper/remove_header_mapper.py b/data_juicer/ops/mapper/remove_header_mapper.py
new file mode 100644
index 000000000..4c36bde64
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_header_mapper.py
@@ -0,0 +1,49 @@
+# Some code here has been modified from:
+# https://github.com/togethercomputer/RedPajama-Data/
+# --------------------------------------------------------
+
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+# TODO
+@OPERATORS.register_module('remove_header_mapper')
+class RemoveHeaderMapper(Mapper):
+ """Mapper to remove headers at the beginning of documents in Latex
+ samples."""
+
+ def __init__(self, drop_no_head: bool = True, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param drop_no_head: whether to drop sample texts without
+ headers.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.pattern = r'^(.*?)('
+ self.pattern += r'\\\bchapter\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
+ self.pattern += r'\\\bpart\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
+ self.pattern += r'\\\bsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
+ self.pattern += r'\\\bsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
+ self.pattern += r'\\\bsubsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
+        self.pattern += r'\\\bparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
+ self.pattern += r'\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}'
+ self.pattern += r')'
+
+ self.drop_no_head = drop_no_head
+
+ def process(self, sample):
+
+ if not re.search(self.pattern, sample[self.text_key], flags=re.DOTALL):
+ if self.drop_no_head:
+ sample[self.text_key] = ''
+ return sample
+
+ sample[self.text_key] = re.sub(pattern=self.pattern,
+ repl=r'\2',
+ string=sample[self.text_key],
+ flags=re.DOTALL)
+ return sample
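+
+
+# Illustrative usage sketch (editorial note, not part of the op itself),
+# assuming the default text key 'text':
+#     >>> op = RemoveHeaderMapper(drop_no_head=True)
+#     >>> s = {'text': 'preamble to drop\n\\section{Intro}\nSome content.'}
+#     >>> op.process(s)['text']        # -> '\\section{Intro}\nSome content.'
+#     >>> op.process({'text': 'no header here'})['text']   # -> ''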
diff --git a/data_juicer/ops/mapper/remove_long_words_mapper.py b/data_juicer/ops/mapper/remove_long_words_mapper.py
new file mode 100644
index 000000000..92ac8fe2d
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_long_words_mapper.py
@@ -0,0 +1,53 @@
+# Some code here has been modified from:
+# https://huggingface.co/spaces/huggingface/text-data-filtering
+# --------------------------------------------------------
+
+import sys
+
+from jsonargparse.typing import PositiveInt
+
+from ..base_op import OPERATORS, Mapper
+from ..common import (SPECIAL_CHARACTERS, merge_on_whitespace_tab_newline,
+ split_on_newline_tab_whitespace, strip)
+
+
+@OPERATORS.register_module('remove_long_words_mapper')
+class RemoveLongWordsMapper(Mapper):
+ """Mapper to remove long words within a specific range."""
+
+ def __init__(self,
+ min_len: PositiveInt = 1,
+ max_len: PositiveInt = sys.maxsize,
+ *args,
+ **kwargs):
+ """
+ Initialization method.
+
+        :param min_len: The min word length to keep in this op; words
+            will be removed if their length is below this parameter.
+        :param max_len: The max word length to keep in this op; words
+            will be removed if their length exceeds this parameter.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.min_len = min_len
+ self.max_len = max_len
+
+ def should_keep_long_word(self, word):
+ if self.min_len <= len(word) <= self.max_len:
+ return True
+ elif self.min_len <= len(strip(word,
+ SPECIAL_CHARACTERS)) <= self.max_len:
+ return True
+ else:
+ return False
+
+ def process(self, sample):
+
+ sentences = split_on_newline_tab_whitespace(sample[self.text_key])
+ sentences = [[[
+ word for word in subsentence if self.should_keep_long_word(word)
+ ] for subsentence in sentence] for sentence in sentences]
+ sample[self.text_key] = merge_on_whitespace_tab_newline(sentences)
+ return sample
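+
+
+# Illustrative usage sketch (editorial note, not part of the op itself),
+# assuming the default text key 'text':
+#     >>> op = RemoveLongWordsMapper(min_len=1, max_len=10)
+#     >>> s = {'text': 'short words and aVeryVeryVeryLongToken here'}
+#     >>> op.process(s)['text']   # -> 'short words and here'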
diff --git a/data_juicer/ops/mapper/remove_specific_chars_mapper.py b/data_juicer/ops/mapper/remove_specific_chars_mapper.py
new file mode 100644
index 000000000..99e15afef
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_specific_chars_mapper.py
@@ -0,0 +1,40 @@
+from typing import List, Union
+
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('remove_specific_chars_mapper')
+class RemoveSpecificCharsMapper(Mapper):
+ """Mapper to clean specific chars in text samples."""
+
+ def __init__(self,
+ chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□',
+ *args,
+ **kwargs):
+ """
+ Initialization method.
+
+ :param chars_to_remove: a list or a string including all
+ characters that need to be removed from text.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+
+ super().__init__(*args, **kwargs)
+ if chars_to_remove:
+ self.pattern = '[' + '|'.join(chars_to_remove) + ']'
+ else:
+ self.pattern = None
+
+ def process(self, sample):
+
+ if self.pattern is None:
+ return sample
+
+ sample[self.text_key] = re.sub(pattern=self.pattern,
+ repl=r'',
+ string=sample[self.text_key],
+ flags=re.DOTALL)
+ return sample
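+
+
+# Illustrative usage sketch (editorial note, not part of the op itself),
+# assuming the default text key 'text':
+#     >>> op = RemoveSpecificCharsMapper(chars_to_remove='◆■')
+#     >>> op.process({'text': '◆Item one ■Item two'})['text']
+#     # -> 'Item one Item two'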
diff --git a/data_juicer/ops/mapper/remove_table_text_mapper.py b/data_juicer/ops/mapper/remove_table_text_mapper.py
new file mode 100644
index 000000000..4f6dfb233
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_table_text_mapper.py
@@ -0,0 +1,45 @@
+import regex as re
+from jsonargparse.typing import restricted_number_type
+
+from ..base_op import OPERATORS, Mapper
+
+from_2_to_20 = restricted_number_type('from_2_to_20', int, [('>=', 2),
+ ('<=', 20)])
+
+
+@OPERATORS.register_module('remove_table_text_mapper')
+class RemoveTableTextMapper(Mapper):
+ """
+ Mapper to remove table texts from text samples.
+
+    A regular expression is used to remove tables whose column numbers
+    are within the specified range.
+ """
+
+ def __init__(self,
+ min_col: from_2_to_20 = 2,
+ max_col: from_2_to_20 = 20,
+ *args,
+ **kwargs):
+ """
+ Initialization method.
+
+ :param min_col: The min number of columns of table to remove.
+ :param max_col: The max number of columns of table to remove.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.min_col = min_col
+ self.max_col = max_col
+ self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}'
+
+ def process(self, sample):
+
+ text = sample[self.text_key]
+ for i in range(self.min_col - 1, self.max_col):
+ pattern = re.compile(self.pattern % i)
+ text = pattern.sub('', text)
+
+ sample[self.text_key] = text
+ return sample
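+
+
+# Illustrative usage sketch (editorial note, not part of the op itself),
+# assuming the default text key 'text'; the 3-column block is removed:
+#     >>> op = RemoveTableTextMapper(min_col=2, max_col=20)
+#     >>> s = {'text': 'Intro line\ncol1 col2 col3\nval1 val2 val3\nOutro line'}
+#     >>> op.process(s)['text']   # -> 'Intro line\nOutro line'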
diff --git a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py
new file mode 100644
index 000000000..4f92b6f43
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py
@@ -0,0 +1,66 @@
+from jsonargparse.typing import List
+
+from data_juicer.utils.model_utils import MODEL_ZOO, prepare_model
+
+from ..base_op import OPERATORS, Mapper
+from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
+ merge_on_whitespace_tab_newline,
+ split_on_newline_tab_whitespace, strip)
+
+
+@OPERATORS.register_module('remove_words_with_incorrect_substrings_mapper')
+class RemoveWordsWithIncorrectSubstringsMapper(Mapper):
+ """Mapper to remove words with incorrect substrings."""
+
+ def __init__(self,
+ lang: str = 'en',
+ tokenization: bool = False,
+ substrings: List = None,
+ *args,
+ **kwargs):
+ """
+ Initialization method.
+
+        :param lang: language of the samples
+        :param tokenization: whether to use a sentencepiece model to
+            tokenize documents
+ :param substrings: The incorrect substrings in words.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ if substrings is None:
+ substrings = ['http', 'www', '.com', 'href', '//']
+ super().__init__(*args, **kwargs)
+ self.tokenization = tokenization
+ self.substrings = substrings
+ if tokenization:
+ self.model_key = prepare_model(lang=lang,
+ model_type='sentencepiece')
+
+ def should_keep_word_with_incorrect_substrings(self, word, substrings):
+ word = strip(word, SPECIAL_CHARACTERS)
+ should_keep = all([(i_substr not in word) for i_substr in substrings])
+ return should_keep
+
+ def process(self, sample):
+ if self.tokenization:
+ tokenizer = MODEL_ZOO.get(self.model_key, None)
+ sentences = get_words_from_document(
+ sample[self.text_key],
+ token_func=tokenizer.encode_as_pieces if tokenizer else None,
+ lower_case=False)
+ words = [
+ word.replace('▁', '') for word in sentences
+ if self.should_keep_word_with_incorrect_substrings(
+ word.replace('▁', ''), self.substrings)
+ ]
+ if len(words) != len(sentences):
+ sample[self.text_key] = ''.join(words)
+ else:
+ sentences = split_on_newline_tab_whitespace(sample[self.text_key])
+ sentences = [[[
+ word for word in subsentence
+ if self.should_keep_word_with_incorrect_substrings(
+ word, self.substrings)
+ ] for subsentence in sentence] for sentence in sentences]
+ sample[self.text_key] = merge_on_whitespace_tab_newline(sentences)
+ return sample
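+
+
+# Illustrative usage sketch (editorial note, not part of the op itself),
+# assuming the default text key 'text' and the default substring list:
+#     >>> op = RemoveWordsWithIncorrectSubstringsMapper()
+#     >>> s = {'text': 'have a look at https://example.com for details'}
+#     >>> op.process(s)['text']   # -> 'have a look at for details'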
diff --git a/data_juicer/ops/mapper/sentence_split_mapper.py b/data_juicer/ops/mapper/sentence_split_mapper.py
new file mode 100644
index 000000000..368d2ba92
--- /dev/null
+++ b/data_juicer/ops/mapper/sentence_split_mapper.py
@@ -0,0 +1,28 @@
+from data_juicer.utils.model_utils import MODEL_ZOO, prepare_model
+
+from ..base_op import OPERATORS, Mapper
+from ..common import get_sentences_from_document
+
+
+@OPERATORS.register_module('sentence_split_mapper')
+class SentenceSplitMapper(Mapper):
+ """Mapper to split text samples to sentences."""
+
+ def __init__(self, lang: str = 'en', *args, **kwargs):
+ """
+ Initialization method.
+
+        :param lang: language of the text to be split into sentences.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.model_key = prepare_model(lang=lang, model_type='nltk')
+
+ def process(self, sample):
+
+ nltk_model = MODEL_ZOO.get(self.model_key, None)
+ sample[self.text_key] = get_sentences_from_document(
+ sample[self.text_key],
+ model_func=nltk_model.tokenize if nltk_model else None)
+ return sample
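+
+
+# Illustrative usage sketch (editorial note, not part of the op itself); the
+# first call downloads the nltk punkt model for the chosen language, and the
+# default text key 'text' is assumed:
+#     >>> op = SentenceSplitMapper(lang='en')
+#     >>> op.process({'text': 'Hello there. How are you?'})['text']
+#     # -> one sentence per line, e.g. 'Hello there.\nHow are you?'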
diff --git a/data_juicer/ops/mapper/whitespace_normalization_mapper.py b/data_juicer/ops/mapper/whitespace_normalization_mapper.py
new file mode 100644
index 000000000..a81d60f0c
--- /dev/null
+++ b/data_juicer/ops/mapper/whitespace_normalization_mapper.py
@@ -0,0 +1,41 @@
+# Most of the code here has been modified from:
+# https://github.com/bigscience-workshop/data-preparation
+# --------------------------------------------------------
+
+from ..base_op import OPERATORS, Mapper
+
+
+@OPERATORS.register_module('whitespace_normalization_mapper')
+class WhitespaceNormalizationMapper(Mapper):
+ """
+ Mapper to normalize different kinds of whitespaces to whitespace ' ' (0x20)
+ in text samples.
+
+ Different kinds of whitespaces can be found here:
+ https://en.wikipedia.org/wiki/Whitespace_character
+ """
+
+ def __init__(self, *args, **kwargs):
+ """
+ Initialization method.
+
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ # whitespaces in unicode can be found here:
+ # https://en.wikipedia.org/wiki/Whitespace_character
+ super().__init__(*args, **kwargs)
+        self.whitespaces = {
+            ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
+            ' ', ' ', ' ', ' ',
+            # zero-width / invisible characters (reconstructed here as
+            # explicit escapes, since they have no visible glyph)
+            '\u200b', '\u200c', '\u200d', '\u2060', '\ufeff', '\ufffc'
+        }
+
+ def process(self, sample):
+ # remove whitespaces before and after the main content
+ text = sample[self.text_key].strip()
+
+ # replace all kinds of whitespaces with ' '
+ sample[self.text_key] = ''.join(
+ [char if char not in self.whitespaces else ' ' for char in text])
+
+ return sample
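+
+
+# Illustrative usage sketch (editorial note, not part of the op itself); any
+# character in `self.whitespaces` becomes a plain ' ' and the text is stripped
+# (this assumes U+00A0 is included in the set):
+#     >>> op = WhitespaceNormalizationMapper()
+#     >>> op.process({'text': '  hello\u00a0world  '})['text']
+#     # -> 'hello world'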
diff --git a/data_juicer/ops/selector/__init__.py b/data_juicer/ops/selector/__init__.py
new file mode 100644
index 000000000..cf0977321
--- /dev/null
+++ b/data_juicer/ops/selector/__init__.py
@@ -0,0 +1 @@
+from . import frequency_specified_field_selector, topk_specified_field_selector
diff --git a/data_juicer/ops/selector/frequency_specified_field_selector.py b/data_juicer/ops/selector/frequency_specified_field_selector.py
new file mode 100644
index 000000000..937642642
--- /dev/null
+++ b/data_juicer/ops/selector/frequency_specified_field_selector.py
@@ -0,0 +1,87 @@
+import numbers
+
+from jsonargparse.typing import ClosedUnitInterval, PositiveInt
+
+from ..base_op import OPERATORS, Selector
+
+
+@OPERATORS.register_module('frequency_specified_field_selector')
+class FrequencySpecifiedFieldSelector(Selector):
+ """Selector to select samples based on the sorted frequency of specified
+ field."""
+
+ def __init__(self,
+ text_key: str = '',
+ top_ratio: ClosedUnitInterval = None,
+ topk: PositiveInt = None,
+ reverse: bool = True,
+ *args,
+ **kwargs):
+ """
+ Initialization method.
+
+ :param text_key: Selector based on the specified value
+ corresponding to the target key. The target key
+            corresponding to multi-level field information needs to be
+ separated by '.'.
+ :param top_ratio: Ratio of selected top specified field value,
+ samples will be selected if their specified field values are
+ within this parameter. When both topk and top_ratio are set,
+ the value corresponding to the smaller number of samples
+ will be applied.
+ :param topk: Number of selected top specified field value,
+ samples will be selected if their specified field values are
+ within this parameter. When both topk and top_ratio are set,
+ the value corresponding to the smaller number of samples
+ will be applied.
+ :param reverse: Determine the sorting rule, if reverse=True,
+ then sort in descending order.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.text_key = text_key
+ self.top_ratio = top_ratio
+ self.topk = topk
+ self.reverse = reverse
+
+ def process(self, dataset):
+ if len(dataset) <= 1 or not self.text_key:
+ return dataset
+
+ text_keys = self.text_key.split('.')
+ assert text_keys[0] in dataset.features.keys(
+ ), "'{}' not in {}".format(text_keys[0], dataset.features.keys())
+
+ field_value_dict = {}
+ for i, item in enumerate(dataset[text_keys[0]]):
+ field_value = item
+ for key in text_keys[1:]:
+ assert key in field_value.keys(), "'{}' not in {}".format(
+ key, field_value.keys())
+ field_value = field_value[key]
+ assert field_value is None or isinstance(
+ field_value, str) or isinstance(
+ field_value, numbers.Number
+ ), 'The {} item is not String, Numbers or NoneType'.format(i)
+ if field_value not in field_value_dict.keys():
+ field_value_dict[field_value] = [i]
+ else:
+ field_value_dict[field_value].append(i)
+
+ select_num = 0
+ if not self.top_ratio:
+ if not self.topk:
+ return dataset
+ else:
+ select_num = self.topk
+ else:
+ select_num = self.top_ratio * len(field_value_dict)
+ if self.topk and self.topk < select_num:
+ select_num = self.topk
+
+ select_index = sum(
+ sorted(field_value_dict.values(),
+ key=lambda x: len(x),
+ reverse=self.reverse)[:int(select_num)], [])
+ return dataset.select(select_index)
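+
+
+# Illustrative usage sketch (editorial note, not part of the op itself), using
+# a small in-memory HuggingFace dataset; the field names here are made up:
+#     >>> from datasets import Dataset
+#     >>> ds = Dataset.from_list([{'text': 'a', 'meta': {'src': 'web'}},
+#     ...                         {'text': 'b', 'meta': {'src': 'web'}},
+#     ...                         {'text': 'c', 'meta': {'src': 'book'}}])
+#     >>> op = FrequencySpecifiedFieldSelector(text_key='meta.src', topk=1)
+#     >>> op.process(ds)['text']   # -> ['a', 'b'], the most frequent group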
diff --git a/data_juicer/ops/selector/topk_specified_field_selector.py b/data_juicer/ops/selector/topk_specified_field_selector.py
new file mode 100644
index 000000000..cdcd425a3
--- /dev/null
+++ b/data_juicer/ops/selector/topk_specified_field_selector.py
@@ -0,0 +1,97 @@
+import heapq
+import sys
+
+from jsonargparse.typing import ClosedUnitInterval, PositiveInt
+
+from ..base_op import OPERATORS, Selector
+
+
+def to_number(s, reverse=True):
+ try:
+ return float(s)
+ except Exception:
+ if reverse:
+ return -sys.maxsize
+ else:
+ return sys.maxsize
+
+
+@OPERATORS.register_module('topk_specified_field_selector')
+class TopkSpecifiedFieldSelector(Selector):
+ """Selector to select top samples based on the sorted specified field
+ value."""
+
+ def __init__(self,
+ text_key: str = '',
+ top_ratio: ClosedUnitInterval = None,
+ topk: PositiveInt = None,
+ reverse: bool = True,
+ *args,
+ **kwargs):
+ """
+ Initialization method.
+
+ :param text_key: Selector based on the specified value
+ corresponding to the target key. The target key
+            corresponding to multi-level field information needs to be
+ separated by '.'.
+ :param top_ratio: Ratio of selected top samples, samples will be
+ selected if their specified field values are within this
+ parameter. When both topk and top_ratio are set, the value
+ corresponding to the smaller number of samples will be
+ applied.
+        :param topk: Number of selected top samples; samples will be
+ selected if their specified field values are within this
+ parameter. When both topk and top_ratio are set, the value
+ corresponding to the smaller number of samples will be
+ applied.
+ :param reverse: Determine the sorting rule, if reverse=True,
+ then sort in descending order.
+ :param args: extra args
+ :param kwargs: extra args
+ """
+ super().__init__(*args, **kwargs)
+ self.text_key = text_key
+ self.top_ratio = top_ratio
+ self.topk = topk
+ self.reverse = reverse
+
+ def process(self, dataset):
+ if len(dataset) <= 1 or not self.text_key:
+ return dataset
+
+ select_num = 0
+ if not self.top_ratio:
+ if not self.topk:
+ return dataset
+ else:
+ select_num = self.topk
+ else:
+ select_num = self.top_ratio * len(dataset)
+ if self.topk and self.topk < select_num:
+ select_num = self.topk
+
+ text_keys = self.text_key.split('.')
+ assert text_keys[0] in dataset.features.keys(
+ ), "'{}' not in {}".format(text_keys[0], dataset.features.keys())
+
+ if len(text_keys) == 1:
+ field_value_list = dataset[text_keys[0]]
+ else:
+ field_value_list = []
+ for item in dataset[text_keys[0]]:
+ field_value = item
+ for key in text_keys[1:]:
+ assert key in field_value.keys(), "'{}' not in {}".format(
+ key, field_value.keys())
+ field_value = field_value[key]
+ field_value_list.append(to_number(field_value, self.reverse))
+
+ if self.reverse:
+ select_index = heapq.nlargest(int(select_num), range(len(dataset)),
+ field_value_list.__getitem__)
+ else:
+ select_index = heapq.nsmallest(int(select_num),
+ range(len(dataset)),
+ field_value_list.__getitem__)
+ return dataset.select(select_index)
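+
+
+# Illustrative usage sketch (editorial note, not part of the op itself), using
+# a small in-memory HuggingFace dataset; the field names here are made up:
+#     >>> from datasets import Dataset
+#     >>> ds = Dataset.from_list([{'text': 'a', 'stats': {'score': 0.3}},
+#     ...                         {'text': 'b', 'stats': {'score': 0.9}},
+#     ...                         {'text': 'c', 'stats': {'score': 0.5}}])
+#     >>> op = TopkSpecifiedFieldSelector(text_key='stats.score', topk=2)
+#     >>> op.process(ds)['text']   # -> ['b', 'c'], the two highest scores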
diff --git a/data_juicer/utils/__init__.py b/data_juicer/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/data_juicer/utils/asset_utils.py b/data_juicer/utils/asset_utils.py
new file mode 100644
index 000000000..5577c1fe7
--- /dev/null
+++ b/data_juicer/utils/asset_utils.py
@@ -0,0 +1,58 @@
+import json
+import os
+
+import requests
+from loguru import logger
+
+from .cache_utils import DATA_JUICER_ASSETS_CACHE
+
+# Default directory to store auxiliary resources
+ASSET_DIR = DATA_JUICER_ASSETS_CACHE
+
+# Default cached assets links for downloading
+ASSET_LINKS = {
+ 'flagged_words':
+ 'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
+ 'data_juicer/flagged_words.json',
+ 'stopwords':
+ 'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
+ 'data_juicer/stopwords.json',
+}
+
+
+def load_words_asset(words_dir: str, words_type: str):
+ """
+    Load words from an asset file named `words_type`. If no valid asset file
+    is found, download it from the ASSET_LINKS cached by the data_juicer team.
+
+ :param words_dir: directory that stores asset file(s)
+ :param words_type: name of target words assets
+ :return: a dict that stores words assets, whose keys are language
+ names, and the values are lists of words
+ """
+ words_dict = {}
+ os.makedirs(words_dir, exist_ok=True)
+
+ # try to load words from `words_type` file
+ for filename in os.listdir(words_dir):
+ if filename.endswith('.json') and words_type in filename:
+ with open(os.path.join(words_dir, filename), 'r') as file:
+ loaded_words = json.load(file)
+ for key in loaded_words:
+ if key in words_dict:
+ words_dict[key] += loaded_words[key]
+ else:
+ words_dict[key] = loaded_words[key]
+ # if the asset file is not found, then download it from ASSET_LINKS
+ if not bool(words_dict):
+        logger.info(f'Specified {words_dir} does not contain '
+                    f'any {words_type} files in json format, '
+                    'downloading the one cached by the data_juicer team')
+ response = requests.get(ASSET_LINKS[words_type])
+ words_dict = response.json()
+ # cache the asset file locally
+ cache_path = os.path.join(words_dir, f'{words_type}.json')
+ with open(cache_path, 'w') as file:
+ json.dump(words_dict, file)
+
+ return words_dict
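+
+
+# Illustrative usage sketch (editorial note, not part of this module's API):
+# load the stopwords asset from the default assets cache, downloading and
+# caching it on the first call:
+#     >>> words = load_words_asset(ASSET_DIR, 'stopwords')
+#     >>> # `words` is a dict mapping language keys to lists of stopwords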
diff --git a/data_juicer/utils/cache_utils.py b/data_juicer/utils/cache_utils.py
new file mode 100644
index 000000000..8ee05a624
--- /dev/null
+++ b/data_juicer/utils/cache_utils.py
@@ -0,0 +1,21 @@
+import os
+
+# Default cache location
+DEFAULT_CACHE_HOME = '~/.cache'
+CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
+
+# Default data_juicer cache location
+DEFAULT_DATA_JUICER_CACHE_HOME = os.path.join(CACHE_HOME, 'data_juicer')
+DATA_JUICER_CACHE_HOME = os.path.expanduser(
+ os.getenv('DATA_JUICER_CACHE_HOME', DEFAULT_DATA_JUICER_CACHE_HOME))
+
+# Default assets cache location
+DEFAULT_DATA_JUICER_ASSETS_CACHE = os.path.join(DATA_JUICER_CACHE_HOME,
+ 'assets')
+DATA_JUICER_ASSETS_CACHE = os.getenv('DATA_JUICER_ASSETS_CACHE',
+ DEFAULT_DATA_JUICER_ASSETS_CACHE)
+# Default models cache location
+DEFAULT_DATA_JUICER_MODELS_CACHE = os.path.join(DATA_JUICER_CACHE_HOME,
+ 'models')
+DATA_JUICER_MODELS_CACHE = os.getenv('DATA_JUICER_MODELS_CACHE',
+ DEFAULT_DATA_JUICER_MODELS_CACHE)
diff --git a/data_juicer/utils/ckpt_utils.py b/data_juicer/utils/ckpt_utils.py
new file mode 100644
index 000000000..b4e0e636f
--- /dev/null
+++ b/data_juicer/utils/ckpt_utils.py
@@ -0,0 +1,128 @@
+import json
+import os
+
+from datasets import Dataset
+from loguru import logger
+
+
+class CheckpointManager:
+ """
+    This class is used to save the latest version of the dataset to the
+    checkpoint directory or load it from the checkpoint directory, a bit like
+    cache management. Rerunning the same config will reload the checkpoint and
+    skip the ops before it.
+
+ If any args of operator in process list is changed, all ops will be
+ rerun from the beginning.
+ """
+
+ def __init__(self, ckpt_dir, original_process_list, num_proc=1):
+ """
+ Initialization method.
+
+ :param ckpt_dir: path to save and load checkpoint
+ :param original_process_list: process list in config
+ :param num_proc: number of process workers when saving dataset
+ """
+ self.ckpt_dir = ckpt_dir
+ self.ckpt_ds_dir = os.path.join(self.ckpt_dir, 'latest')
+ self.ckpt_op_record = os.path.join(self.ckpt_dir, 'ckpt_op.json')
+ self.process_list = original_process_list
+ self.num_proc = num_proc
+ self.op_record = []
+
+ self.ckpt_available = self.check_ckpt()
+
+ def get_left_process_list(self):
+ """
+        Get the remaining process list of ops for processing the dataset.
+        When a checkpoint is available, the already-processed ops are removed
+        from the process list; otherwise it is kept unchanged.
+
+ :return: process list of left ops
+ """
+ return self.process_list
+
+ def check_ckpt(self):
+ """
+ Check if checkpoint is available.
+
+ :return: True when checkpoint is available, else False
+ """
+ if os.path.exists(self.ckpt_ds_dir) \
+ and os.path.isdir(self.ckpt_ds_dir) \
+ and os.path.exists(self.ckpt_op_record) \
+ and os.path.isfile(self.ckpt_op_record) \
+ and self.check_ops_to_skip():
+ return True
+ else:
+ os.makedirs(self.ckpt_dir, exist_ok=True)
+ return False
+
+ def record(self, op_name, op_args):
+ """Save op name and args to op record, which is used to compare with
+ the process list from config to decide if a checkpoint is available."""
+ self.op_record.append({op_name: op_args})
+
+ def check_ops_to_skip(self):
+ """
+ Check which ops need to be skipped in the process list.
+
+        If the op record list from the checkpoint is the same as the
+        prefix of the process list, then skip these ops and start
+        processing from the checkpoint. Otherwise, process the original
+        dataset from scratch.
+
+        :return: whether to skip some ops or not
+ """
+
+ # load op records
+ with open(self.ckpt_op_record, 'r') as fin:
+ self.op_record = json.load(fin)
+
+ # check whether the op records are exactly the same
+ # with prefix of process list
+ # 1. same: remove these ops from process list
+ # 2. different: cleanup op record, and keep process list unchanged
+ recorded_op_num = len(self.op_record)
+ prefix_process = self.process_list[:recorded_op_num]
+ all_the_same = True
+ dif1, dif2 = None, None
+
+ for record_op, config_op in zip(self.op_record, prefix_process):
+ if record_op != config_op:
+ all_the_same = False
+ dif1, dif2 = record_op, config_op
+ break
+ if all_the_same:
+ for op in self.op_record:
+ op_name = list(op.keys())[0]
+ logger.info(f'Skip op [{op_name}].')
+ self.process_list = self.process_list[recorded_op_num:]
+ return True
+ else:
+ logger.warning(f'Processed ops of checkpoint are different from '
+ f'current configs: checkpoint-{dif1} vs. config-'
+ f'{dif2}. All ops will be processed from the '
+ f'beginning')
+ self.op_record = []
+ return False
+
+ def save_ckpt(self, ds):
+ """
+ Save dataset to checkpoint directory and dump processed ops list.
+
+ :param ds: input dataset to save
+ """
+ ds.save_to_disk(self.ckpt_ds_dir, num_proc=self.num_proc)
+
+ with open(self.ckpt_op_record, 'w') as fout:
+ json.dump(self.op_record, fout)
+
+ def load_ckpt(self):
+ """
+ Load dataset from a checkpoint file.
+
+ :return: a dataset stored in checkpoint file.
+ """
+ ds = Dataset.load_from_disk(self.ckpt_ds_dir)
+ return ds
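+
+
+# Illustrative usage sketch (editorial note, not part of this module);
+# `work_dir`, `cfg` and `dataset` are hypothetical caller-side variables:
+#     >>> ckpt = CheckpointManager(work_dir, cfg.process, num_proc=4)
+#     >>> if ckpt.ckpt_available:
+#     ...     dataset = ckpt.load_ckpt()
+#     >>> left_ops = ckpt.get_left_process_list()
+#     >>> # run left_ops on dataset, calling ckpt.record(name, args) per op,
+#     >>> # then persist the result:
+#     >>> ckpt.save_ckpt(dataset)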
diff --git a/data_juicer/utils/file_utils.py b/data_juicer/utils/file_utils.py
new file mode 100644
index 000000000..a78572a53
--- /dev/null
+++ b/data_juicer/utils/file_utils.py
@@ -0,0 +1,69 @@
+from pathlib import Path
+from typing import Dict, List, Tuple, Union
+
+from datasets.utils.extract import ZstdExtractor as Extractor
+
+
+def find_files_with_suffix(
+        path: Union[str, Path],
+        suffixes: Union[str, List[str], Tuple[str]] = None
+) -> Dict[str, List[str]]:
+ """
+ Traverse a path to find all files with the specified suffixes.
+
+ :param path: path (str/Path): source path
+    :param suffixes: specified file suffixes, '.txt' or ['.txt', '.md'],
+        etc.
+    :return: a dict whose keys are file suffixes and whose values are
+        lists of paths to files with that suffix
+ """
+ path = Path(path)
+ file_dict = {}
+
+ if suffixes is None:
+ suffixes = []
+
+ if isinstance(suffixes, str):
+ suffixes = [suffixes]
+
+ suffixes = [
+ x.lower() if x.startswith('.') else '.' + x.lower() for x in suffixes
+ ]
+
+ if path.is_file():
+ files = [path]
+ else:
+ searched_files = path.rglob('*')
+ files = [file for file in searched_files if file.is_file()]
+
+ extractor = Extractor
+
+ # only keep the file with the specified suffixes
+ for file in files:
+ suffix = file.suffix.lower()
+
+ if extractor.is_extractable(file):
+
+ # TODO
+ # hard code
+ # only support zstd-format file now,
+ # and use the last 2 sub-suffixes as the final suffix
+ # just like '.jsonl.zst'
+ file_suffixes = [suffix.lower() for suffix in file.suffixes]
+ suffix = ''.join(file_suffixes[-2:])
+
+ if not suffixes or (suffix in suffixes):
+ if suffix not in file_dict:
+ file_dict[suffix] = [str(file)]
+ else:
+ file_dict[suffix].append(str(file))
+ return file_dict
+
+
+def is_absolute_path(path: Union[str, Path]) -> bool:
+ """
+    Check whether the input path is an absolute path.
+
+    :param path: input path
+    :return: True if the input path is an absolute path, False if it is
+        a relative path.
+ """
+ return Path(path).is_absolute()
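+
+
+# Illustrative usage sketch (editorial note, not part of this module); the
+# path './data' is a hypothetical example:
+#     >>> file_dict = find_files_with_suffix('./data', ['.jsonl', '.jsonl.zst'])
+#     >>> # -> e.g. {'.jsonl': [...], '.jsonl.zst': [...]}, grouped by suffix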
diff --git a/data_juicer/utils/logger_utils.py b/data_juicer/utils/logger_utils.py
new file mode 100644
index 000000000..d930c6a21
--- /dev/null
+++ b/data_juicer/utils/logger_utils.py
@@ -0,0 +1,132 @@
+# Some codes here are adapted from
+# https://github.com/MegEngine/YOLOX/blob/main/yolox/utils/logger.py
+
+# Copyright 2021 Megvii, Base Detection
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import sys
+
+from loguru import logger
+from loguru._file_sink import FileSink
+
+LOGGER_SETUP = False
+
+
+def get_caller_name(depth=0):
+ """
+ Get caller name by depth.
+
+ :param depth: depth of caller context, use 0 for caller depth.
+ :return: module name of the caller
+ """
+ # the following logic is a little bit faster than inspect.stack() logic
+ frame = inspect.currentframe().f_back
+ for _ in range(depth):
+ frame = frame.f_back
+
+ return frame.f_globals['__name__']
+
+
+class StreamToLoguru:
+ """Stream object that redirects writes to a logger instance."""
+
+ def __init__(self, level='INFO', caller_names=('datasets', 'logging')):
+ """
+ Initialization method.
+
+ :param level: log level string of loguru. Default value: "INFO".
+        :param caller_names: caller names of redirected modules.
+            Default value: ('datasets', 'logging').
+ """
+ self.level = level
+ self.linebuf = ''
+ self.caller_names = caller_names
+
+ def write(self, buf):
+ full_name = get_caller_name(depth=1)
+ module_name = full_name.rsplit('.', maxsplit=-1)[0]
+ if module_name in self.caller_names:
+ for line in buf.rstrip().splitlines():
+ # use caller level log
+ logger.opt(depth=2).log(self.level, line.rstrip())
+ else:
+ # sys.__stdout__.write(buf)
+ logger.opt(raw=True).info(buf)
+
+ def flush(self):
+ pass
+
+
+def redirect_sys_output(log_level='INFO'):
+ """
+ Redirect stdout/stderr to loguru with log level.
+
+ :param log_level: log level string of loguru. Default value: "INFO".
+ """
+ redirect_logger = StreamToLoguru(log_level)
+ sys.stderr = redirect_logger
+ sys.stdout = redirect_logger
+
+
+def get_log_file_path():
+ """
+ Get the path to the location of the log file.
+
+ :return: a location of log file.
+ """
+ for _, handler in logger._core.handlers.items():
+ if isinstance(handler._sink, FileSink):
+ return handler._sink._file.name
+
+
+def setup_logger(save_dir, distributed_rank=0, filename='log.txt', mode='o'):
+ """
+ Setup logger for training and testing.
+
+ :param save_dir: location to save log file
+ :param distributed_rank: device rank when multi-gpu environment
+ :param filename: log file name to save
+    :param mode: log file write mode, `a` for append or `o` for override.
+        Default is `o`.
+ :return: logger instance.
+ """
+ global LOGGER_SETUP
+
+ if LOGGER_SETUP:
+ return
+
+ loguru_format = (
+ '{time:YYYY-MM-DD HH:mm:ss} | '
+ '{level: <8} | '
+ '{name}:{line} - {message}')
+
+ logger.remove()
+ save_file = os.path.join(save_dir, filename)
+ if mode == 'o' and os.path.exists(save_file):
+ os.remove(save_file)
+
+ # only keep logger in rank0 process
+ if distributed_rank == 0:
+ logger.add(
+ sys.stderr,
+ format=loguru_format,
+ level='INFO',
+ enqueue=True,
+ )
+ logger.add(save_file)
+
+ # redirect stdout/stderr to loguru
+ redirect_sys_output('INFO')
+ LOGGER_SETUP = True
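+
+
+# Illustrative usage sketch (editorial note, not part of this module); the
+# directory './outputs/log' is a hypothetical example:
+#     >>> setup_logger(save_dir='./outputs/log', filename='log.txt', mode='o')
+#     >>> from loguru import logger
+#     >>> logger.info('written to stderr and to ./outputs/log/log.txt')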
diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
new file mode 100644
index 000000000..8460010b1
--- /dev/null
+++ b/data_juicer/utils/model_utils.py
@@ -0,0 +1,204 @@
+import os
+
+import wget
+from loguru import logger
+
+from .cache_utils import DATA_JUICER_MODELS_CACHE
+
+# Default directory to store models
+MODEL_PATH = DATA_JUICER_MODELS_CACHE
+
+# Default backup cached models links for downloading
+BACKUP_MODEL_LINKS = {
+ # language identification model from fasttext
+ 'lid.176.bin':
+ 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/',
+
+ # tokenizer and language model for English from sentencepiece and KenLM
+ '%s.sp.model':
+ 'https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/',
+ '%s.arpa.bin':
+ 'https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/',
+
+ # sentence split model from nltk punkt
+ 'punkt.%s.pickle':
+ 'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
+ 'data_juicer/models/'
+}
+
+# Default cached models links for downloading
+MODEL_LINKS = 'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/' \
+ 'data_juicer/models/'
+
+MODEL_ZOO = {}
+
+
+def check_model(model_name, args=(), force=False):
+ """
+    Check whether a model exists in MODEL_PATH. If it exists, return its full
+    path. Otherwise, download it from the cached model links.
+
+ :param model_name: a specified model name
+ :param args: optional extra args of model.
+    :param force: Whether to download the model forcefully or not. Sometimes
+        the model file may be incomplete for some reason, so it needs to be
+        downloaded again forcefully.
+ """
+ if not os.path.exists(MODEL_PATH):
+ os.makedirs(MODEL_PATH)
+
+ # check if the specified model exists. If it does not exist, download it
+ true_model_name = model_name % args
+ mdp = os.path.join(MODEL_PATH, true_model_name)
+ if force:
+ if os.path.exists(mdp):
+ os.remove(mdp)
+ logger.info(
+            f'Model [{true_model_name}] is invalid. Re-downloading...')
+ else:
+ logger.info(
+            f'Model [{true_model_name}] not found. Downloading...')
+
+ try:
+ model_link = os.path.join(MODEL_LINKS, true_model_name)
+ wget.download(model_link, mdp, bar=None)
+ except: # noqa: E722
+ try:
+ backup_model_link = os.path.join(
+ BACKUP_MODEL_LINKS[model_name], true_model_name)
+ wget.download(backup_model_link, mdp, bar=None)
+ except: # noqa: E722
+ logger.error(
+ f'Downloading model [{true_model_name}] error. '
+ f'Please retry later or download it into {MODEL_PATH} '
+ f'manually from {model_link} or {backup_model_link} ')
+ exit(1)
+ return mdp
+
+
+def prepare_fasttext_model(model_name):
+ """
+ Prepare and load a fasttext model.
+
+ :param model_name: input model name
+ :return: model instance.
+ """
+ import fasttext
+ logger.info('Loading fasttext language identification model...')
+ try:
+ ft_model = fasttext.load_model(check_model(model_name))
+ except: # noqa: E722
+ ft_model = fasttext.load_model(check_model(model_name, force=True))
+ return ft_model
+
+
+def prepare_sentencepiece_model(model_name, lang):
+ """
+ Prepare and load a sentencepiece model.
+
+ :param model_name: input model name in formatting syntax
+ :param lang: language to render model name
+ :return: model instance.
+ """
+ import sentencepiece
+ logger.info('Loading sentencepiece model...')
+ sentencepiece_model = sentencepiece.SentencePieceProcessor()
+ try:
+ sentencepiece_model.load(check_model(model_name, lang))
+ except: # noqa: E722
+ sentencepiece_model.load(check_model(model_name, lang, force=True))
+ return sentencepiece_model
+
+
+def prepare_kenlm_model(model_name, lang):
+ """
+ Prepare and load a kenlm model.
+
+ :param model_name: input model name in formatting syntax.
+ :param lang: language to render model name
+ :return: model instance.
+ """
+ import kenlm
+ logger.info('Loading kenlm language model...')
+ try:
+ kenlm_model = kenlm.Model(check_model(model_name, lang))
+ except: # noqa: E722
+ kenlm_model = kenlm.Model(check_model(model_name, lang, force=True))
+ return kenlm_model
+
+
+def prepare_nltk_model(model_name, lang):
+ """
+ Prepare and load a nltk punkt model.
+
+ :param model_name: input model name in formatting syntax
+ :param lang: language to render model name
+ :return: model instance.
+ """
+
+ nltk_to_punkt = {
+ 'en': 'english',
+ 'fr': 'french',
+ 'pt': 'portuguese',
+ 'es': 'spanish'
+ }
+ assert lang in nltk_to_punkt.keys(
+ ), 'lang must be one of the following: {}'.format(
+ list(nltk_to_punkt.keys()))
+
+ from nltk.data import load
+ logger.info('Loading nltk punkt split model...')
+ try:
+ nltk_model = load(check_model(model_name, nltk_to_punkt[lang]))
+ except: # noqa: E722
+ nltk_model = load(
+ check_model(model_name, nltk_to_punkt[lang], force=True))
+ return nltk_model
+
+
+def prepare_huggingface_tokenizer(tokenizer_name):
+ """
+ Prepare and load a tokenizer from HuggingFace.
+
+ :param tokenizer_name: input tokenizer name
+ :return: a tokenizer instance.
+ """
+ from transformers import AutoTokenizer
+ logger.info('Loading tokenizer from HuggingFace...')
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+ return tokenizer
+
+
+def prepare_model(lang='en', model_type='sentencepiece', model_key=None):
+ """
+ Prepare and load a model or a tokenizer from MODEL_ZOO.
+
+ :param lang: which lang model to load
+ :param model_type: model or tokenizer type
+ :param model_key: tokenizer name, only used when prepare HuggingFace
+ tokenizer
+ :return: a model or tokenizer instance
+ """
+
+ type_to_name = {
+ 'fasttext': ('lid.176.bin', prepare_fasttext_model),
+ 'sentencepiece': ('%s.sp.model', prepare_sentencepiece_model),
+ 'kenlm': ('%s.arpa.bin', prepare_kenlm_model),
+ 'nltk': ('punkt.%s.pickle', prepare_nltk_model),
+ 'huggingface': ('%s', prepare_huggingface_tokenizer)
+ }
+ assert model_type in type_to_name.keys(
+ ), 'model_type must be one of the following: {}'.format(
+ list(type_to_name.keys()))
+
+ if model_key is None:
+ model_key = model_type + '_' + lang
+ if model_key not in MODEL_ZOO.keys():
+ model_name, model_func = type_to_name[model_type]
+ if model_type == 'fasttext':
+ MODEL_ZOO[model_key] = model_func(model_name)
+ elif model_type == 'huggingface':
+ MODEL_ZOO[model_key] = model_func(model_key)
+ else:
+ MODEL_ZOO[model_key] = model_func(model_name, lang)
+ return model_key
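+
+
+# Illustrative usage sketch (editorial note, not part of this module): prepare
+# a model once (downloading it on first use), then fetch it from MODEL_ZOO
+# inside an op:
+#     >>> model_key = prepare_model(lang='en', model_type='kenlm')
+#     >>> kenlm_model = MODEL_ZOO.get(model_key)
+#     >>> kenlm_model.perplexity('a simple test sentence')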
diff --git a/data_juicer/utils/registry.py b/data_juicer/utils/registry.py
new file mode 100644
index 000000000..8847ae2d4
--- /dev/null
+++ b/data_juicer/utils/registry.py
@@ -0,0 +1,133 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# --------------------------------------------------------
+# Most of the code here has been modified from:
+# https://github.com/modelscope/modelscope/blob/master/modelscope/utils/registry.py
+# --------------------------------------------------------
+
+from loguru import logger
+
+
+class Registry(object):
+ """This class is used to register some modules to registry by a repo
+ name."""
+
+ def __init__(self, name: str):
+ """
+ Initialization method.
+
+ :param name: a registry repo name
+ """
+ self._name = name
+ self._modules = {}
+
+ @property
+ def name(self):
+ """
+ Get name of current registry.
+
+ :return: name of current registry.
+ """
+ return self._name
+
+ @property
+ def modules(self):
+ """
+ Get all modules in current registry.
+
+ :return: a dict storing modules in current registry.
+ """
+ return self._modules
+
+ def list(self):
+ """Logging the list of module in current registry."""
+ for m in self._modules.keys():
+ logger.info(f'{self._name}\t{m}')
+
+ def get(self, module_key):
+ """
+        Get the module named module_key from the current registry. If not
+        found, return None.
+
+ :param module_key: specified module name
+ :return: module named module_key
+ """
+ return self._modules.get(module_key, None)
+
+ def _register_module(self, module_name=None, module_cls=None, force=False):
+ """
+ Register module to registry.
+
+ :param module_name: module name
+ :param module_cls: module class object
+ :param force: Whether to override an existing class with the
+ same name. Default: False.
+ """
+
+ if module_name is None:
+ module_name = module_cls.__name__
+
+ if module_name in self._modules and not force:
+ raise KeyError(
+ f'{module_name} is already registered in {self._name}')
+
+ self._modules[module_name] = module_cls
+
+ def register_module(self,
+ module_name: str = None,
+ module_cls: type = None,
+ force=False):
+ """
+        Register a module class object to the registry with the
+        specified module name.
+
+ :param module_name: module name
+ :param module_cls: module class object
+ :param force: Whether to override an existing class with
+ the same name. Default: False.
+
+        Example:
+            >>> registry = Registry('text_formatters')
+            >>> @registry.register_module()
+            >>> class TextFormatter:
+            >>>     pass
+
+            >>> class TextFormatter2:
+            >>>     pass
+            >>> registry.register_module(module_name='text_formatter2',
+            ...                          module_cls=TextFormatter2)
+ """
+ if not (module_name is None or isinstance(module_name, str)):
+            raise TypeError(f'module_name must be either None or str, '
+                            f'but got {type(module_name)}')
+ if module_cls is not None:
+ self._register_module(module_name=module_name,
+ module_cls=module_cls,
+ force=force)
+ return module_cls
+
+ # if module_cls is None, should return a decorator function
+ def _register(module_cls):
+ """
+ Register module class object to registry.
+
+ :param module_cls: module class object
+ :return: module class object.
+ """
+ self._register_module(module_name=module_name,
+ module_cls=module_cls,
+ force=force)
+ return module_cls
+
+ return _register
diff --git a/demos/.DS_Store b/demos/.DS_Store
new file mode 100644
index 000000000..4075bd989
Binary files /dev/null and b/demos/.DS_Store differ
diff --git a/demos/README.md b/demos/README.md
new file mode 100644
index 000000000..e91dd42cf
--- /dev/null
+++ b/demos/README.md
@@ -0,0 +1,40 @@
+# Demos
+
+This folder contains some demos developed with streamlit to allow users to easily experience the basic functions and tools of Data-Juicer.
+
+## Usage
+
+```shell
+cd xxx
+streamlit run xxx/app.py
+```
+
+## Categories
+
+### Data
+
+This folder contains some demo datasets.
+
+### Data visualization diversity
+
+This demo analyzes the verb-noun structures of an SFT dataset and draws its diversity as a sunburst chart.
+
+### Data visualization op effect
+
+This demo analyzes the statistics of a dataset and displays the effect of every Filter op under different thresholds.
+
+### Data visualization statistics
+
+This demo analyzes the statistics (up to 13 for now) of a dataset.
+
+### Tool quality classifier
+
+This demo supplies 3 text quality classifiers and scores datasets with them.
+
+## Coming Soon
+- Overview scan
+- Auto evaluation helm
+- Data process loop
+- Data mixture
+- SFT data zh
+- Process sci data
+- Process code data
+- Data process hpo
diff --git a/demos/README_ZH.md b/demos/README_ZH.md
new file mode 100644
index 000000000..097f3436f
--- /dev/null
+++ b/demos/README_ZH.md
@@ -0,0 +1,42 @@
+# 示例文件
+
+此文件夹包含一些示例,帮助用户轻松体验 Data-Juicer各种功能和工具。
+
+## 用法
+
+```shell
+cd xxx
+streamlit run xxx/app.py
+```
+
+## 目录
+
+### Data
+
+该文件夹包含一些样例数据集。
+
+### Data visualization diversity
+
+该示例可以用来分析 SFT 数据集的动词-名词结构, 并绘制成sunburst层级环形图表。
+
+### Data visualization op effect
+
+该示例可以分析数据集的统计信息,并根据这些统计信息显示出每个 `Filter` 算子在不同阈值下的效果。
+
+### Data visualization statistics
+
+该示例可以分析数据集,并获得多达13种统计信息。
+
+### Tool quality classifier
+该示例提供了3种文本质量打分器, 对数据集进行打分评估。
+
+## Coming Soon
+- Overview scan | 初体验
+- Auto evaluation helm | 自动HELM评测
+- Data process loop | 数据分析处理迭代
+- Data mixture | 数据混合
+- SFT data zh | 中文指令微调数据处理
+- Process sci data | 科学文献数据处理
+- Process code data | 代码数据处理
+- Data process hpo | 数据混合超参自动优化
+
diff --git a/demos/data/demo-dataset-content.jsonl b/demos/data/demo-dataset-content.jsonl
new file mode 100644
index 000000000..07871df3a
--- /dev/null
+++ b/demos/data/demo-dataset-content.jsonl
@@ -0,0 +1,6 @@
+{"content": "Today is Sunday and it's a happy day!", "src": "Arxiv", "date": "2023-04-27", "version": "1.0"}
+{"content": "Do you need a cup of coffee?", "src": "code", "author": "xxx"}
+{"content": "你好,请问你是谁", "src": "customized", "author": "xxx"}
+{"content": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "src": "Oscar", "version": "2.0"}
+{"content": "欢迎来到阿里巴巴!", "src": "customized", "version": "0.1", "author": "xxx"}
+{"content": "This paper proposed a novel method on LLM pretraining.", "src": "customized", "author": "xxx"}
diff --git a/demos/data/demo-dataset-deduplication.jsonl b/demos/data/demo-dataset-deduplication.jsonl
new file mode 100644
index 000000000..d2590cec3
--- /dev/null
+++ b/demos/data/demo-dataset-deduplication.jsonl
@@ -0,0 +1,14 @@
+{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
+{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
+{"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}}
+{"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}}
+{"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}}
+{"text": "Today is sunday and it's really a happy day!", "meta": {"src": "Arxiv", "date": "2023-05-15", "version": "1.1"}}
+{"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
+{"text": "Smithfield employs 3,700 people at its plant in Sioux Falls, South Dakota. The plant slaughters 19,500 pigs a day — 5 percent of U.S. pork. Most of the workers are immigrants from Ethiopia, Mexico, South Sudan, Honduras, Myanmar, Somalia, Guatemala, and other poor countries.\\n\\nInevitably workers must pass within one foot of hundreds of colleagues in the hallways, locker rooms, cafeterias, and cutting lines. The same conditions have spurred Covid-19 outbreaks at meat plants from Minnesota and Wisconsin to Colorado, Nebraska, Missouri, Iowa, Pennsylvania, North Carolina, and Georgia.\\n\\n801 workers at the Sioux Falls plant have tested positive, together with 206 people close to them. The outbreak has killed Agustín Rodríguez Martínez, aged 64, an employee with two decades of experience originally from El Salvador, and Craig Allen Franken, 61, who worked for Smithfield his entire adult life.\\n\\nThe company knew of its first infection on March 24 or earlier. The virus spread exponentially for several weeks. Ahead of Easter Sunday and Monday (April 12-13), Smithfield promised to “completely shutter” to sanitize and put up cardboard and plastic sheet dividers. This would not end transmission, as potentially hundreds of staff were already carrying the virus. But even during this “shutdown,” many cars were seen in the parking lot. The mayor admits that the company lied, and the local AFL-CIO alleges the plant ran 60 percent production. On Easter, with 238 known infections, Smithfield finally agreed to shut down indefinitely after a request from the mayor and the governor. Yet the company insisted on waiting three more days to actually halt production.\\n\\nSmithfield denied contributing to the outbreak, saying it took a “very proactive approach.” Relying on racism, the company blamed workers for getting themselves sick. A spokesperson said the outbreak was so severe because of the plant’s “large immigrant population,” claming “Living circumstances in certain cultures are different than they are with your traditional American family.” They slandered the workers as dirty, ignorant, and untrustworthy with help from governor Kristi Noem, who claimed, “99 percent of what’s going on today wasn’t happening inside the facility. It was more at home, where these employees were going home and spreading some of the virus” by living too close together.\\n\\nOne sick worker, Michael Bul Gayo Gatluak, 22 and originally from South Sudan, says, “With how we work on the line, I would say I got sick because of them not taking safety measures.” His job is “really, really close” to other workers chopping fresh-killed pigs. “The job is so heavy. You have to breathe so hard.”\\n\\nIn early March, union officials requested masks, overcoats, entrance checking for fevers, and less crowding in 500-capacity cafeterias. But Smithfield waited on most safety measures until early April. Only April 6 did they start checking for fevers. Instead of protective masks, they gave out beard nets.\\n\\nSmithfield concealed infections with a policy of informing only employees whose work stations were in the same area as a person who tested positive. The fact that workers are required to move around was willfully ignored. One worker who tested positive said, “I clearly would have gotten it at the factory. This week I have worked on three different floors. 
I’ve eaten in two different cafeterias … I’ve been walking through the whole place.” Employees from the eighth floor of the plant were quarantined, but everyone else was told to keep working.\\n\\nWhat Is Really Going On?\\n\\nAverage plant wages are around $16 an hour. Smithfield never raised them. Instead, they offered $500 to employees who could go all of April without an unapproved day off. The company says their “Responsibility Bonuses” show their “immense gratefulness” to employees “for their selfless sacrifices.”\\n\\nMeanwhile, the local Argus Leader wrote union members wanted essential-worker hazard pay, which “would be considered hourly compensation about 1.5 or two times their normal pay.” One worker said, “I feel like they’re bribing us with [the bonus] to come to work sick. That’s how you know they don’t care.”\\n\\nBoth Sioux Falls workers killed by Covid-19 were in their sixties. It is unconscionable that they were still working. All meatpackers over 50 should be on paid leave. Agustín Rodríguez, 64, had a rough job sawing the legs off dead pigs. He mopped floors with a fever shortly before he was hospitalized.\\n\\nWhen CEO Kenneth Sullivan closed the plant, he claimed, “We have continued to run our facilities for one reason: to sustain our nation’s food supply.” This is an effort to sweep Smithfield’s abuses under the rug, as if the company were operating for public benefit. This patriotic propaganda that all Americans are in it together is like a drug to keep workers from getting organized.\\n\\nThe major union in the industry, including at Smithfield, is the United Food and Commercial Workers union (UFCW). What union leaders have done is ultimately troubling.\\n\\nCan Workers Fight?\\n\\nLocal AFL-CIO president Kooper Caraway has publicly said management delayed safety action as long as possible for profit. But while some workers were demanding a two-week shutdown, Caraway told the Argus Leader that was unrealistic because the government considers the plant essential. He suggested the union would be happy with minimal safety measures: “Even if 10 people get exposed in a day rather than 11. If you can implement a program where even one or two less people get exposed during a shift, that’s one or two less people.” Of course reducing infections is good, but suggesting workers would be satisfied if the company allowed 90% of the contagion to continue is horrifying.\\n\\nThe response of UFCW leadership was worse. As the disease was exploding, they told the Argus Leader, “We applaud [Smithfield’s] decision to temporarily close the plant [over Easter weekend] to push for an even safer work environment.” What does “even safer” mean in this context?\\n\\nThe union bureaucracy has taken weak action elsewhere. In Pennsylvania, the UFCW negotiated $2 hazard pay for two months with Cargill Meat — the same pandemic premium Amazon gave workers without a union. In Nebraska, the UFCW negotiated $4 hazard pay for one month with meat giant JBS.\\n\\nThe union has said nothing about forcing companies to send older workers home with pay, even though a 70-year-old shop steward and a 78-year-old grandfather working at JBS plants were killed by Covid-19. Smithfield workers were promised only two weeks of shutdown pay. For many, this compensation is half their normal paycheck because they routinely put in 66 hour weeks — overtime that costs exhaustion and chronic pain.\\n\\nUnion officials endeavor to cooperate with the meat companies. 
An Iowa UFCW president actually suggested it might be impossible for plants to move workers a full six feet apart and told the Des Moines Register, “We can’t stop the plants. If we stop the plants from running, we stop feeding the country. We want to do everything we can to make sure the employees are safe to keep the plant running.”\\n\\nEvery part of this explanation directly overlaps with what the Smithfield CEO said. Unfortunately, it amounts to accepting the company’s excuses.\\n\\nThey claim that workers who do hard physical labor, waking up at 4 a.m. and often working six days a week for years, would be guilty of taking food away from the people and hurting America if they dared to fight for their human needs. But nothing is said about the company raking in profits and even murdering workers to increase them.\\n\\nSmithfield’s parent company W.H. Group, which slaughters around 30 million pigs per year in plants in both the United States and China, saw its profits skyrocket by about one third in 2019 to $1.38 billion. It is disturbing that UFCW officials do not bring up these soaring profits in their response to the outbreaks. Reuters published a report on the corporation’s financial success in late March. The head of W.H. Group had touted to the media that it got through the pandemic in China with very limited impact on production.\\n\\nIt is true that many Smithfield workers are reasonably afraid for their jobs and want to keep working. A 25-year-old employee explained, “I have a lot of bills. My baby’s coming soon — I have to work.” At the same time, he was afraid of infecting his pregnant wife. His spouse, a former employee, said bitterly, “Smithfield— they don’t care about employees. They only care about their money.”\\n\\nWorkers are pressured in these two painful directions. Nonetheless, work can mean solidarity. Before Smithfield even checked temperatures, there was a “sick-out” strike without union support by 800 to 1,000 workers at a JBS meat factory in Colorado. Hundreds of workers also called in sick days at a Nebraska JBS plant.\\n\\nTrade union leaders won’t even whisper the word “strike” when thousands of workers are thinking about it. They are limiting themselves to polite requests. We need a workers’ movement that asks who controls the factory, that threatens to disrupt the bosses’ profits, and that allows workers to use their immense power — this could change the meat industry and the world.", "meta": {"src": "mine", "author": "xx"}}
+{"text": "Smithfield employs 3,700 people at its plants in Sioux Falls, South Dakota. The plant slaughters 19,500 pig a day — 5 percent of U.S. pork. Most of the workers are immigrants from Ethiopia, Mexico, South Sudan, Honduras, Myanmar, Somalia, Guatemala, and other poor countries.\\n\\nInevitably workers must pass within one foot of hundreds of colleagues in the hallways, locker rooms, cafeterias, and cutting lines. The same conditions have spurred Covid-19 outbreaks at meat plants from Minnesota and Wisconsin to Colorado, Nebraska, Missouri, Iowa, Pennsylvania, North Carolina, and Georgia.\\n\\n801 workers at the Sioux Falls plant have tested positive, together with 206 people close to them. The outbreak has killed Agustín Rodríguez Martínez, aged 64, an employee with two decades of experience originally from El Salvador, and Craig Allen Franken, 61, who worked for Smithfield his entire adult life.\\n\\nThe company knew of its first infection on March 24 or earlier. The virus spread exponentially for several weeks. Ahead of Easter Sunday and Monday (April 12-13), Smithfield promised to “completely shutter” to sanitize and put up cardboard and plastic sheet dividers. This would not end transmission, as potentially hundreds of staff were already carrying the virus. But even during this “shutdown,” many cars were seen in the parking lot. The mayor admits that the company lied, and the local AFL-CIO alleges the plant ran 60 percent production. On Easter, with 238 known infections, Smithfield finally agreed to shut down indefinitely after a request from the mayor and the governor. Yet the company insisted on waiting three more days to actually halt production.\\n\\nSmithfield denied contributing to the outbreak, saying it took a “very proactive approach.” Relying on racism, the company blamed workers for getting themselves sick. A spokesperson said the outbreak was so severe because of the plant’s “large immigrant population,” claming “Living circumstances in certain cultures are different than they are with your traditional American family.” They slandered the workers as dirty, ignorant, and untrustworthy with help from governor Kristi Noem, who claimed, “99 percent of what’s going on today wasn’t happening inside the facility. It was more at home, where these employees were going home and spreading some of the virus” by living too close together.\\n\\nOne sick worker, Michael Bul Gayo Gatluak, 22 and originally from South Sudan, says, “With how we work on the line, I would say I got sick because of them not taking safety measures.” His job is “really, really close” to other workers chopping fresh-killed pigs. “The job is so heavy. You have to breathe so hard.”\\n\\nIn early March, union officials requested masks, overcoats, entrance checking for fevers, and less crowding in 500-capacity cafeterias. But Smithfield waited on most safety measures until early April. Only April 6 did they start checking for fevers. Instead of protective masks, they gave out beard nets.\\n\\nSmithfield concealed infections with a policy of informing only employees whose work stations were in the same area as a person who tested positive. The fact that workers are required to move around was willfully ignored. One worker who tested positive said, “I clearly would have gotten it at the factory. This week I have worked on three different floors. 
I’ve eaten in two different cafeterias … I’ve been walking through the whole place.” Employees from the eighth floor of the plant were quarantined, but everyone else was told to keep working.\\n\\nWhat Is Really Going On?\\n\\nAverage plant wages are around $16 an hour. Smithfield never raised them. Instead, they offered $500 to employees who could go all of April without an unapproved day off. The company says their “Responsibility Bonuses” show their “immense gratefulness” to employees “for their selfless sacrifices.”\\n\\nMeanwhile, the local Argus Leader wrote union members wanted essential-worker hazard pay, which “would be considered hourly compensation about 1.5 or two times their normal pay.” One worker said, “I feel like they’re bribing us with [the bonus] to come to work sick. That’s how you know they don’t care.”\\n\\nBoth Sioux Falls workers killed by Covid-19 were in their sixties. It is unconscionable that they were still working. All meatpackers over 50 should be on paid leave. Agustín Rodríguez, 64, had a rough job sawing the legs off dead pigs. He mopped floors with a fever shortly before he was hospitalized.\\n\\nWhen CEO Kenneth Sullivan closed the plant, he claimed, “We have continued to run our facilities for one reason: to sustain our nation’s food supply.” This is an effort to sweep Smithfield’s abuses under the rug, as if the company were operating for public benefit. This patriotic propaganda that all Americans are in it together is like a drug to keep workers from getting organized.\\n\\nThe major union in the industry, including at Smithfield, is the United Food and Commercial Workers union (UFCW). What union leaders have done is ultimately troubling.\\n\\nCan Workers Fight?\\n\\nLocal AFL-CIO president Kooper Caraway has publicly said management delayed safety action as long as possible for profit. But while some workers were demanding a two-week shutdown, Caraway told the Argus Leader that was unrealistic because the government considers the plant essential. He suggested the union would be happy with minimal safety measures: “Even if 10 people get exposed in a day rather than 11. If you can implement a program where even one or two less people get exposed during a shift, that’s one or two less people.” Of course reducing infections is good, but suggesting workers would be satisfied if the company allowed 90% of the contagion to continue is horrifying.\\n\\nThe response of UFCW leadership was worse. As the disease was exploding, they told the Argus Leader, “We applaud [Smithfield’s] decision to temporarily close the plant [over Easter weekend] to push for an even safer work environment.” What does “even safer” mean in this context?\\n\\nThe union bureaucracy has taken weak action elsewhere. In Pennsylvania, the UFCW negotiated $2 hazard pay for two months with Cargill Meat — the same pandemic premium Amazon gave workers without a union. In Nebraska, the UFCW negotiated $4 hazard pay for one month with meat giant JBS.\\n\\nThe union has said nothing about forcing companies to send older workers home with pay, even though a 70-year-old shop steward and a 78-year-old grandfather working at JBS plants were killed by Covid-19. Smithfield workers were promised only two weeks of shutdown pay. For many, this compensation is half their normal paycheck because they routinely put in 66 hour weeks — overtime that costs exhaustion and chronic pain.\\n\\nUnion officials endeavor to cooperate with the meat companies. 
An Iowa UFCW president actually suggested it might be impossible for plants to move workers a full six feet apart and told the Des Moines Register, “We can’t stop the plants. If we stop the plants from running, we stop feeding the country. We want to do everything we can to make sure the employees are safe to keep the plant running.”\\n\\nEvery part of this explanation directly overlaps with what the Smithfield CEO said. Unfortunately, it amounts to accepting the company’s excuses.\\n\\nThey claim that workers who do hard physical labor, waking up at 4 a.m. and often working six days a week for years, would be guilty of taking food away from the people and hurting America if they dared to fight for their human needs. But nothing is said about the company raking in profits and even murdering workers to increase them.\\n\\nSmithfield’s parent company W.H. Group, which slaughters around 30 million pigs per year in plants in both the United States and China, saw its profits skyrocket by about one third in 2019 to $1.38 billion. It is disturbing that UFCW officials do not bring up these soaring profits in their response to the outbreaks. Reuters published a report on the corporation’s financial success in late March. The head of W.H. Group had touted to the media that it got through the pandemic in China with very limited impact on production.\\n\\nIt is true that many Smithfield workers are reasonably afraid for their jobs and want to keep working. A 25-year-old employee explained, “I have a lot of bills. My baby’s coming soon — I have to work.” At the same time, he was afraid of infecting his pregnant wife. His spouse, a former employee, said bitterly, “Smithfield— they don’t care about employees. They only care about their money.”\\n\\nWorkers are pressured in these two painful directions. Nonetheless, work can mean solidarity. Before Smithfield even checked temperatures, there was a “sick-out” strike without union support by 800 to 1,000 workers at a JBS meat factory in Colorado. Hundreds of workers also called in sick days at a Nebraska JBS plant.\\n\\nTrade union leaders won’t even whisper the word “strike” when thousands of workers are thinking about it. They are limiting themselves to polite requests. We need a workers’ movement that asks who controls the factory, that threatens to disrupt the bosses’ profits, and that allows workers to use their immense power — this could change the meat industry and the world.", "meta": {"src": "customized", "author": "x"}}
+{"text": "Smithfield employs 3,700 people at its plant in Sioux Falls, South Dakota. The plant slaughters 19,500 pigs a day — 5 percent of U.S. pork. Most of the workers are immigrants from Ethiopia, Mexico, South Sudan, Honduras, Myanmar, Somalia, Guatemala, and other poor countries.\\n\\nInevitably workers must pass within one foot of hundreds of colleagues in the hallways, locker rooms, cafeterias, and cutting lines. The same conditions have spurred Covid-19 outbreaks at meat plants from Minnesota and Wisconsin to Colorado, Nebraska, Missouri, Iowa, Pennsylvania, North Carolina, and Georgia.\\n\\n801 workers at the Sioux Falls plant have tested positive, together with 206 people close to them. The outbreak has killed Agustín Rodríguez Martínez, aged 64, an employee with two decades of experience originally from El Salvador, and Craig Allen Franken, 61, who worked for Smithfield his entire adult life.\\n\\nThe company knew of its first infection on March 24 or earlier. The virus spread exponentially for several weeks. Ahead of Easter Sunday and Monday (April 12-13), Smithfield promised to “completely shutter” to sanitize and put up cardboard and plastic sheet dividers. This would not end transmission, as potentially hundreds of staff were already carrying the virus. But even during this “shutdown,” many cars were seen in the parking lot. The mayor admits that the company lied, and the local AFL-CIO alleges the plant ran 60 percent production. On Easter, with 238 known infections, Smithfield finally agreed to shut down indefinitely after a request from the mayor and the governor. Yet the company insisted on waiting three more days to actually halt production.\\n\\nSmithfield denied contributing to the outbreak, saying it took a “very proactive approach.” Relying on racism, the company blamed workers for getting themselves sick. A spokesperson said the outbreak was so severe because of the plant’s “large immigrant population,” claming “Living circumstances in certain cultures are different than they are with your traditional American family.” They slandered the workers as dirty, ignorant, and untrustworthy with help from governor Kristi Noem, who claimed, “99 percent of what’s going on today wasn’t happening inside the facility. It was more at home, where these employees were going home and spreading some of the virus” by living too close together.\\n\\nOne sick worker, Michael Bul Gayo Gatluak, 22 and originally from South Sudan, says, “With how we work on the line, I would say I got sick because of them not taking safety measures.” His job is “really, really close” to other workers chopping fresh-killed pigs. “The job is so heavy. You have to breathe so hard.”\\n\\nIn early March, union officials requested masks, overcoats, entrance checking for fevers, and less crowding in 500-capacity cafeterias. But Smithfield waited on most safety measures until early April. Only April 6 did they start checking for fevers. Instead of protective masks, they gave out beard nets.\\n\\nSmithfield concealed infections with a policy of informing only employees whose work stations were in the same area as a person who tested positive. The fact that workers are required to move around was willfully ignored. One worker who tested positive said, “I clearly would have gotten it at the factory. This week I have worked on three different floors. 
I’ve eaten in two different cafeterias … I’ve been walking through the whole place.” Employees from the eighth floor of the plant were quarantined, but everyone else was told to keep working.\\n\\nWhat Is Really Going On?\\n\\nAverage plant wages are around $16 an hour. Smithfield never raised them. Instead, they offered $500 to employees who could go all of April without an unapproved day off. The company says their “Responsibility Bonuses” show their “immense gratefulness” to employees “for their selfless sacrifices.”\\n\\nMeanwhile, the local Argus Leader wrote union members wanted essential-worker hazard pay, which “would be considered hourly compensation about 1.5 or two times their normal pay.” One worker said, “I feel like they’re bribing us with [the bonus] to come to work sick. That’s how you know they don’t care.”\\n\\nBoth Sioux Falls workers killed by Covid-19 were in their sixties. It is unconscionable that they were still working. All meatpackers over 50 should be on paid leave. Agustín Rodríguez, 64, had a rough job sawing the legs off dead pigs. He mopped floors with a fever shortly before he was hospitalized.\\n\\nWhen CEO Kenneth Sullivan closed the plant, he claimed, “We have continued to run our facilities for one reason: to sustain our nation’s food supply.” This is an effort to sweep Smithfield’s abuses under the rug, as if the company were operating for public benefit. This patriotic propaganda that all Americans are in it together is like a drug to keep workers from getting organized.", "meta": {"src": "mine", "author": "xx"}}
+{"text": "Smithfield employs 3,700 people at its plants in Sioux Falls, South Dakota. The plant slaughters 19,500 pig a day — 5 percent of U.S. pork. Most of the workers are immigrants from Ethiopia, Mexico, South Sudan, Honduras, Myanmar, Somalia, Guatemala, and other poor countries.\\n\\nInevitably workers must pass within one foot of hundreds of colleagues in the hallways, locker rooms, cafeterias, and cutting lines. The same conditions have spurred Covid-19 outbreaks at meat plants from Minnesota and Wisconsin to Colorado, Nebraska, Missouri, Iowa, Pennsylvania, North Carolina, and Georgia.\\n\\n801 workers at the Sioux Falls plant have tested positive, together with 206 people close to them. The outbreak has killed Agustín Rodríguez Martínez, aged 64, an employee with two decades of experience originally from El Salvador, and Craig Allen Franken, 61, who worked for Smithfield his entire adult life.\\n\\nThe company knew of its first infection on March 24 or earlier. The virus spread exponentially for several weeks. Ahead of Easter Sunday and Monday (April 12-13), Smithfield promised to “completely shutter” to sanitize and put up cardboard and plastic sheet dividers. This would not end transmission, as potentially hundreds of staff were already carrying the virus. But even during this “shutdown,” many cars were seen in the parking lot. The mayor admits that the company lied, and the local AFL-CIO alleges the plant ran 60 percent production. On Easter, with 238 known infections, Smithfield finally agreed to shut down indefinitely after a request from the mayor and the governor. Yet the company insisted on waiting three more days to actually halt production.\\n\\nSmithfield denied contributing to the outbreak, saying it took a “very proactive approach.” Relying on racism, the company blamed workers for getting themselves sick. A spokesperson said the outbreak was so severe because of the plant’s “large immigrant population,” claming “Living circumstances in certain cultures are different than they are with your traditional American family.” They slandered the workers as dirty, ignorant, and untrustworthy with help from governor Kristi Noem, who claimed, “99 percent of what’s going on today wasn’t happening inside the facility. It was more at home, where these employees were going home and spreading some of the virus” by living too close together.\\n\\nOne sick worker, Michael Bul Gayo Gatluak, 22 and originally from South Sudan, says, “With how we work on the line, I would say I got sick because of them not taking safety measures.” His job is “really, really close” to other workers chopping fresh-killed pigs. “The job is so heavy. You have to breathe so hard.”\\n\\nIn early March, union officials requested masks, overcoats, entrance checking for fevers, and less crowding in 500-capacity cafeterias. But Smithfield waited on most safety measures until early April. Only April 6 did they start checking for fevers. Instead of protective masks, they gave out beard nets.\\n\\nSmithfield concealed infections with a policy of informing only employees whose work stations were in the same area as a person who tested positive. The fact that workers are required to move around was willfully ignored. One worker who tested positive said, “I clearly would have gotten it at the factory. This week I have worked on three different floors. 
I’ve eaten in two different cafeterias … I’ve been walking through the whole place.” Employees from the eighth floor of the plant were quarantined, but everyone else was told to keep working.\\n\\nWhat Is Really Going On?\\n\\nAverage plant wages are around $16 an hour. Smithfield never raised them. Instead, they offered $500 to employees who could go all of April without an unapproved day off. The company says their “Responsibility Bonuses” show their “immense gratefulness” to employees “for their selfless sacrifices.”\\n\\nMeanwhile, the local Argus Leader wrote union members wanted essential-worker hazard pay, which “would be considered hourly compensation about 1.5 or two times their normal pay.” One worker said, “I feel like they’re bribing us with [the bonus] to come to work sick. That’s how you know they don’t care.”\\n\\nBoth Sioux Falls workers killed by Covid-19 were in their sixties. It is unconscionable that they were still working. All meatpackers over 50 should be on paid leave. Agustín Rodríguez, 64, had a rough job sawing the legs off dead pigs. He mopped floors with a fever shortly before he was hospitalized.\\n\\nWhen CEO Kenneth Sullivan closed the plant, he claimed, “We have continued to run our facilities for one reason: to sustain our nation’s food supply.” This is an effort to sweep Smithfield’s abuses under the rug, as if the company were operating for public benefit. This patriotic propaganda that all Americans are in it together is like a drug to keep workers from getting organized.", "meta": {"src": "customized", "author": "x"}}
+{"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
+{"text": "第九届会议\\n2003年7月28日至8月8日\\n牙买加金斯敦\\n为来自发展中国家的法律和技术委员会以及财务委员会成员\\n参加委员会会议支付费用的方式\\n1. 国际海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员会成员参加委员会会议的费用。\\n2. 由于秘书长向会员国发出为该信托基金捐款的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账户。\\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会议的方式,包括审查是否可能从管理局行政预算中提供经费。\\n4. 自愿信托基金迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至215 000美元。\\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率提供。 购买机票的所有安排均由联合国秘书处执行。\\n6. 虽然已经设立了临时性的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\\n(a) 从管理局一般行政经费累计利息中拨出一定数额的经费;\\n(b) 每年从上一年预算未动用部分中拨出规定的数额;\\n(c) 从先驱投资者基金利息中拨出规定的数额。\\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财务委员会提出一份报告。\\n附件\\n资助来自发展中国家的法律和技术委员会以及财务\\n委员会成员出席会议的指示性费用(美元)\\n成员\\n机票\\n机场\\n费用\\n金斯敦每日生活\\n津贴\\n转机途中每日生活\\n7日\\n共计\\n14日\\n经济舱\\n公务舱\\n7天=(8天每日生活\\n津贴)\\n14天= (15天每日生活津贴)\\n商务舱\\n法律和技术委员会\\n印度尼西亚\\n(纽约)\\n黎巴嫩\\n巴基斯坦\\n阿根廷\\n喀麦隆\\n墨西哥\\n巴西\\n塞内加尔\\n莫桑比克\\n埃及(纽约)\\n大韩民国\\n印度\\n斐济\\n智利\\n中国\\n纳米比亚\\n小计\\n财务委员会\\n缅甸\\n乌干达\\n牙买加\\n印度(纽约)\\n尼日利亚\\n总计\\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000美元至215 000美元(四舍五入)。", "meta": {"src": "wiki", "version": "0.1", "author": "xyz"}}
+{"text": "第九届会议\\n时间:2003年7月28日至8月8日\\n牙买加金斯敦\\n为来自发展中国家的法律和技术委员会以及财务委员会成员\\n参加委员会会议支付费用的方式\\n1. 国际海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员会成员参加委员会会议的费用。\\n2. 由于秘书长向会员国发出为该信托基金捐款的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账户。\\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会议的方式,包括审查是否可能从管理局行政预算中提供经费。\\n4. 自愿信托基金迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至215 000美元。\\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率提供。 购买机票的所有安排均由联合国秘书处执行。\\n6. 虽然已经设立了临时性的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\\n(a) 从管理局一般行政经费累计利息中拨出一定数额的经费;\\n(b) 每年从上一年预算未动用部分中拨出规定的数额;\\n(c) 从先驱投资者基金利息中拨出规定的数额。\\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财务委员会提出一份报告。\\n附件\\n资助来自发展中国家的法律和技术委员会以及财务\\n委员会成员出席会议的指示性费用(美元)\\n成员\\n机票\\n机场\\n费用\\n金斯敦每日生活\\n津贴\\n转机途中每日生活\\n7日\\n共计\\n14日\\n经济舱\\n公务舱\\n7天=(8天每日生活\\n津贴)\\n14天= (15天每日生活津贴)\\n商务舱\\n法律和技术委员会\\n印度尼西亚\\n(纽约)\\n黎巴嫩\\n巴基斯坦\\n阿根廷\\n喀麦隆\\n墨西哥\\n巴西\\n塞内加尔\\n莫桑比克\\n埃及(纽约)\\n大韩民国\\n印度\\n斐济\\n智利\\n中国\\n纳米比亚\\n小计\\n财务委员会\\n缅甸\\n乌干达\\n牙买加\\n印度(纽约)\\n尼日利亚\\n总计\\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000美元至215 000美元(四舍五入)。", "meta": {"src": "wiki", "version": "0.1", "author": "xy"}}
diff --git a/demos/data/demo-dataset.jsonl b/demos/data/demo-dataset.jsonl
new file mode 100644
index 000000000..4d8cdadfd
--- /dev/null
+++ b/demos/data/demo-dataset.jsonl
@@ -0,0 +1,6 @@
+{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
+{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
+{"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}}
+{"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}}
+{"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}}
+{"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
diff --git a/demos/data_visualization_diversity/.DS_Store b/demos/data_visualization_diversity/.DS_Store
new file mode 100644
index 000000000..e4ae7825a
Binary files /dev/null and b/demos/data_visualization_diversity/.DS_Store differ
diff --git a/demos/data_visualization_diversity/app.py b/demos/data_visualization_diversity/app.py
new file mode 100644
index 000000000..acf4596f5
--- /dev/null
+++ b/demos/data_visualization_diversity/app.py
@@ -0,0 +1,236 @@
+import os
+
+import plotly.express as px
+import streamlit as st
+import yaml
+from loguru import logger
+
+from data_juicer.analysis.diversity_analysis import (DiversityAnalysis,
+ get_diversity,
+ prepare_diversity_model)
+from data_juicer.config import init_configs
+from data_juicer.core import Analyser
+from data_juicer.ops.base_op import OPERATORS
+
+
+@st.cache_data
+def convert_csv(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_csv().encode('utf-8')
+
+
+@st.cache_data
+def convert_jsonl(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_json(orient='records', lines=True).encode('utf-8')
+
+
+@st.cache_data
+def get_diversity_model(lang):
+ diversity_model = prepare_diversity_model(lang)
+ return diversity_model
+
+
+@st.cache_data
+def postproc_diversity(dataframe, **kwargs):
+ df = get_diversity(dataframe, **kwargs)
+ return df
+
+
+def pretty_out(d):
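+ # Format a parsed config for display: dump the 'process' op list as YAML
+ # and skip the internal 'config' entry and per-op keys.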
+ res = ''
+ process = ''
+ op_names = set(OPERATORS.modules.keys())
+ for key, value in d.items():
+ if key == 'process':
+ process = yaml.dump(value,
+ allow_unicode=True,
+ default_flow_style=False)
+ elif key == 'config' or key.split('.')[0] in op_names:
+ continue
+ else:
+ res += f'{key}:\n \t {value}\n'
+ res += 'process:\n' + \
+ '\n'.join(['\t' + line for line in process.splitlines()])
+
+ return res
+
+
+def parse_cfg():
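+ # Parse the fixed demo config (configs/demo.yaml) with init_configs, cache the
+ # parsed cfg in the Streamlit session state, and return printable versions of
+ # both the fully-parsed and the user-specified config.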
+
+ cfg_cmd = '--config configs/demo.yaml'
+
+ args_in_cmd = cfg_cmd.split()
+
+ if len(args_in_cmd) >= 2 and args_in_cmd[0] == '--config':
+ cfg_f_name = args_in_cmd[1]
+ else:
+ st.warning('Please specify a config command or upload a config file.')
+ st.stop()
+
+ if not os.path.exists(cfg_f_name):
+ st.warning('Cannot parse the config: '
+ f'config file does not exist (cfg_f_name={cfg_f_name})')
+ st.stop()
+
+ with open(cfg_f_name, 'r') as cfg_f:
+ specified_cfg = yaml.safe_load(cfg_f)
+
+ try:
+ parsed_cfg = init_configs(args=args_in_cmd)
+ st.session_state.cfg = parsed_cfg
+
+ return pretty_out(parsed_cfg), pretty_out(specified_cfg), parsed_cfg
+ except Exception as e:
+ return str(e), pretty_out(specified_cfg), None
+
+
+def load_dataset(dataset_file):
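+ # Load the uploaded dataset (written to a temporary local file) or the
+ # default demo dataset through the Analyser's formatter.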
+
+ cfg = st.session_state.get('cfg', parse_cfg()[2])
+ if cfg is None:
+ raise ValueError('you have not specified a valid cfg')
+ # force saving all stats figures into one file
+ cfg['save_stats_in_one_file'] = True
+
+ del_file = False
+ if dataset_file is not None:
+
+ file_contents = dataset_file.getvalue()
+ with open(dataset_file.name, 'wb') as f:
+ f.write(file_contents)
+ cfg.dataset_path = dataset_file.name
+ del_file = True
+
+ logger.info('=========Stage: analyze original data=========')
+ analyzer = Analyser(cfg)
+
+ dataset = analyzer.formatter.load_dataset()
+ if del_file:
+ os.remove(dataset_file.name)
+ return dataset
+
+
+class Visualize:
+
+ @staticmethod
+ def setup():
+ st.set_page_config(
+ page_title='Juicer',
+ page_icon=':smile:',
+ layout='wide',
+ # initial_sidebar_state="expanded",
+ )
+
+ readme_link = 'https://code.alibaba-inc.com/DAIL-DATA/' \
+ 'data_juicer/blob/master/README.md'
+ st.markdown(
+ '<div align="center"> <font size="70"> Data-Juicer \
+ </font> </div>',
+ unsafe_allow_html=True,
+ )
+ st.markdown(
+ f'<div align="center"> A Data-Centric Text Processing System for \
+ Large Language Models, \
+ see more details in our <a href={readme_link}>Document</a> </div>',
+ unsafe_allow_html=True,
+ )
+
+ @staticmethod
+ def draw_sunburst(df, path, values):
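+ # Render the verb-noun diversity distribution as a Plotly sunburst chart.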
+
+ fig = px.sunburst(df, path=path, values=values)
+ fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
+ font_family='Times New Roman',
+ font=dict(size=40))
+ st.plotly_chart(fig, use_container_width=True)
+
+ @staticmethod
+ def diversity():
+ col1, col2 = st.columns(2)
+ with col1:
+ dataset_file = st.file_uploader(
+ label='Upload your custom dataset (jsonl/csv)',
+ type=['json', 'jsonl', 'csv'])
+
+ with col2:
+ st.text_area(label='Default Demo dataset',
+ disabled=True,
+ value='data/demo-dataset.jsonl')
+
+ with st.expander('Set diversity params', expanded=True):
+
+ col1, col2, col3, col4 = st.columns(4)
+ with col1:
+ label = 'Which language of your dataset'
+ options = ['en', 'zh']
+ lang_select = st.selectbox(
+ label=label,
+ options=options,
+ )
+ with col2:
+ top_k_verbs = st.number_input('Set the top_k of verbs',
+ value=20)
+ with col3:
+ top_k_nouns = st.number_input('Set the top_k of nouns',
+ value=4)
+ with col4:
+ threshold = st.slider('Count threshold',
+ min_value=0,
+ value=0,
+ max_value=100,
+ step=1)
+ diversity_btn = st.button('Start to analyze Verb-Noun diversity',
+ use_container_width=True)
+
+ with st.expander('Diversity Results ', expanded=True):
+
+ cfg = st.session_state.get('cfg', parse_cfg()[2])
+ output_path = os.path.join(os.path.dirname(cfg.export_path),
+ 'analysis')
+ raw_df = None
+ if diversity_btn:
+ try:
+ with st.spinner('Analyzing diversity, please wait...'):
+ dataset = load_dataset(dataset_file)
+
+ diversity_analysis = DiversityAnalysis(
+ dataset, output_path)
+
+ raw_df = diversity_analysis.compute(
+ lang_or_model=get_diversity_model(lang_select))
+
+ st.session_state[f'diversity{lang_select}'] = raw_df
+
+ except Exception as e:
+ st.warning(f'Error {str(e)} in {lang_select}')
+ else:
+ raw_df = st.session_state.get(f'diversity{lang_select}', None)
+
+ if raw_df is not None:
+ df = postproc_diversity(raw_df,
+ top_k_verbs=top_k_verbs,
+ top_k_nouns=top_k_nouns)
+ df = df[df['count'] >= threshold]
+ Visualize.draw_sunburst(df,
+ path=['verb', 'noun'],
+ values='count')
+
+ st.download_button(
+ label='Download diversity data as CSV',
+ data=convert_csv(df),
+ file_name='diversity.csv',
+ mime='text/csv',
+ )
+
+ @staticmethod
+ def visualize():
+ Visualize.setup()
+ Visualize.diversity()
+
+
+def main():
+ Visualize.visualize()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demos/data_visualization_diversity/configs/demo.yaml b/demos/data_visualization_diversity/configs/demo.yaml
new file mode 100644
index 000000000..d71266901
--- /dev/null
+++ b/demos/data_visualization_diversity/configs/demo.yaml
@@ -0,0 +1,66 @@
+# Process config example for dataset
+
+# global parameters
+project_name: 'demo'
+dataset_path: './data/demo-dataset.jsonl' # path to your dataset directory or file
+np: 1 # number of subprocesses to process your dataset
+
+export_path: './outputs/demo/demo-processed.jsonl'
+
+# process schedule
+# a list of several process operators with their arguments
+process:
+ # Filter ops
+ - alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
+ tokenization: false # Whether to count the ratio of alphanumeric to the total number of tokens.
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.9 # the max ratio of filter range
+ - average_line_length_filter: # filter text with the average length of lines out of specific range.
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - character_repetition_filter: # filter text with the character repetition ratio out of specific range
+ rep_len: 10 # repetition length for char-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
+ - flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
+ lang: en # consider flagged words in what language
+ tokenization: false # whether to use model to tokenize documents
+ max_ratio: 0.0045 # the max ratio to filter text
+ flagged_words_dir: ./assets # directory to store flagged words dictionaries
+ use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+ - language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
+ lang: en # keep text in what language
+ min_score: 0.8 # the min language scores to filter text
+ - maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - perplexity_filter: # filter text with perplexity score out of specific range
+ lang: en # compute perplexity in what language
+ max_ppl: 1500 # the max perplexity score to filter text
+ - special_characters_filter: # filter text with special-char ratio out of specific range
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.25 # the max ratio of filter range
+ - stopwords_filter: # filter text with stopword ratio smaller than a specific min value
+ lang: en # consider stopwords in what language
+ tokenization: false # whether to use model to tokenize documents
+ min_ratio: 0.3 # the min ratio to filter text
+ stopwords_dir: ./assets # directory to store stopwords dictionaries
+ use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+ - text_length_filter: # filter text with length out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - words_num_filter: # filter text with number of words out of specific range
+ lang: en # sample in which language
+ tokenization: false # whether to use model to tokenize documents
+ min_num: 10 # the min number of filter range
+ max_num: 10000 # the max number of filter range
+ - word_repetition_filter: # filter text with the word repetition ratio out of specific range
+ lang: en # sample in which language
+ tokenization: false # whether to use model to tokenize documents
+ rep_len: 10 # repetition length for word-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
diff --git a/demos/data_visualization_diversity/data/demo-dataset.jsonl b/demos/data_visualization_diversity/data/demo-dataset.jsonl
new file mode 100644
index 000000000..3f9e7640b
--- /dev/null
+++ b/demos/data_visualization_diversity/data/demo-dataset.jsonl
@@ -0,0 +1,147 @@
+[
+ {
+ "text": "Give three tips for staying healthy.",
+ "input": "",
+ "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."
+ },
+ {
+ "text": "What are the three primary colors?",
+ "input": "",
+ "output": "The three primary colors are red, blue, and yellow."
+ },
+ {
+ "text": "Describe the structure of an atom.",
+ "input": "",
+ "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."
+ },
+ {
+ "text": "How can we reduce air pollution?",
+ "input": "",
+ "output": "There are a number of ways to reduce air pollution, such as shifting to renewable energy sources, encouraging the use of public transportation, prohibiting the burning of fossil fuels, implementing policies to reduce emissions from industrial sources, and implementing vehicle emissions standards. Additionally, individuals can do their part to reduce air pollution by reducing car use, avoiding burning materials such as wood, and changing to energy efficient appliances."
+ },
+ {
+ "text": "Describe a time when you had to make a difficult decision.",
+ "input": "",
+ "output": "I had to make a difficult decision when I was working as a project manager at a construction company. I was in charge of a project that needed to be completed by a certain date in order to meet the client\u2019s expectations. However, due to unexpected delays, we were not able to meet the deadline and so I had to make a difficult decision. I decided to extend the deadline, but I had to stretch the team\u2019s resources even further and increase the budget. Although it was a risky decision, I ultimately decided to go ahead with it to ensure that the project was completed on time and that the client\u2019s expectations were met. The project was eventually successfully completed and this was seen as a testament to my leadership and decision-making abilities."
+ },
+ {
+ "text": "Identify the odd one out.",
+ "input": "Twitter, Instagram, Telegram",
+ "output": "Telegram"
+ },
+ {
+ "text": "Explain why the following fraction is equivalent to 1/4",
+ "input": "4/16",
+ "output": "The fraction 4/16 is equivalent to 1/4 because both numerators and denominators are divisible by 4. Dividing both the top and bottom numbers by 4 yields the fraction 1/4."
+ },
+ {
+ "text": "Write a short story in third person narration about a protagonist who has to make an important career decision.",
+ "input": "",
+ "output": "John was at a crossroads in his life. He had just graduated college and was now facing the big decision of what career to pursue. After much deliberation, he decided that he wanted to be an accountant and help the financially disadvantaged. He had always been good with numbers and enjoyed seeing the tangible results of his work. \n\nJohn enrolled in accounting courses and initially found it quite challenging. He had to learn multiple systems and regulations quickly, but he worked hard and eventually excelled in his studies. After a few years, John started working at an accounting firm in his city. He was eager to put his knowledge of taxes and accounting to use in a real-world setting.\n\nJohn loved his job, as it let him express his creativity in finding strategies to save his clients money. After a few years at the firm, he became a senior accountant and was asked to manage bigger and more challenging cases. He was now a respected figure in the financial industry, but he still remembers when he was just a recent college graduate, unsure of the direction in which his life would take him."
+ },
+ {
+ "text": "Render a 3D model of a house",
+ "input": "",
+ "output": " This type of text cannot be fulfilled by a GPT model."
+ },
+ {
+ "text": "Evaluate this sentence for spelling and grammar mistakes",
+ "input": "He finnished his meal and left the resturant",
+ "output": "He finished his meal and left the restaurant."
+ },
+ {
+ "text": "How did Julius Caesar die?",
+ "input": "",
+ "output": "Julius Caesar was assassinated by a group of up to 60 conspirators, led by Gaius Cassius Longinus and Marcus Junius Brutus, in the Senate House on the Ides of March (15 March) of 44 BC."
+ },
+ {
+ "text": "What is the capital of France?",
+ "input": "",
+ "output": "The capital of France is Paris."
+ },
+ {
+ "text": "Generate a list of ten items a person might need for a camping trip",
+ "input": "",
+ "output": "1. Tent\n2. Sleeping bags\n3. Flashlight\n4. Matches/lighter\n5. Insect repellent\n6. Extra clothes\n7. First aid kit\n8. Camp stove\n9. Cooking utensils\n10. Biodegradable soaps/shampoos"
+ },
+ {
+ "text": "Discuss the causes of the Great Depression",
+ "input": "",
+ "output": "The Great Depression was a severe global economic downturn that lasted from 1929 to 1939. It was caused by several factors, including the stock market crash of 1929, structural weaknesses in the economy, low consumer spending, high unemployment, and a lack of confidence in the banking system. Other contributing factors included overproduction of commodities, reliance on borrowing to finance investments, agricultural overproduction, and a decrease in international trade."
+ },
+ {
+ "text": "Classify the following into animals, plants, and minerals",
+ "input": "Oak tree, copper ore, elephant",
+ "output": "Oak tree: Plant\n Copper ore: Mineral\n Elephant: Animal"
+ },
+ {
+ "text": "Explain the use of word embeddings in Natural Language Processing",
+ "input": "",
+ "output": "Word embeddings are one of the most powerful tools available for Natural Language Processing (NLP). They are mathematical representations of words or phrases in a vector space, allowing similarities between words and the context in which they are used to be measured. Word embeddings are useful for tasks such as sentiment analysis, text classification, predicting the next word in a sequence, and understanding synonyms and analogies. They allow for words to be processed as numerical values, giving machines an easier way to perform NLP tasks."
+ },
+ {
+ "text": "Describe the function of a computer motherboard",
+ "input": "",
+ "output": "A computer motherboard is the main circuit board in a computer. It is the backbone of the system, connecting all other components such as the processor, RAM, storage, graphics card and peripheral ports. It provides communication pathways between all of these components, allowing them to interact and work together. The motherboard is responsible for regulating power, and also houses slots for expansion cards, as well as ports for USB, audio, Ethernet, and other connection types."
+ },
+ {
+ "text": "Reverse engineer this code to create a new version",
+ "input": "def factorialize(num):\n factorial = 1\n for i in range(1, num):\n factorial *= i\n \n return factorial",
+ "output": "def factorialize(num):\n factorial = 1\n for i in range(num, 0, -1):\n factorial *= i\n \n return factorial"
+ },
+ {
+ "text": "Propose an ethical solution to the problem of data privacy",
+ "input": "",
+ "output": "One ethical solution to the problem of data privacy is to implement an opt-in data collection policy. This means that companies cannot collect and use personal data without the explicit permission of the data owners. This approach gives users the power to determine which data companies can use and for what purpose. Companies should also be transparent about how they use the collected data, so that users can make informed decisions and have control over their own data."
+ },
+ {
+ "text": "Generate three verbs that mean the same as \"to apologize\"",
+ "input": "",
+ "output": "Acknowledge, express regret, make amends."
+ },
+ {
+ "text": "What does DNA stand for?",
+ "input": "",
+ "output": "DNA stands for deoxyribonucleic acid."
+ },
+ {
+ "text": "Compare and contrast the Cuban Missile Crisis and the Vietnam War.",
+ "input": "",
+ "output": "The Cuban Missile Crisis and the Vietnam War were two of the largest and deadliest conflicts of the 20th century. The Cuban Missile Crisis involved a standoff between the United States and the Soviet Union over Soviet nuclear missile bases in Cuba, while the Vietnam War was a prolonged military conflict in Indochina between the North, which was backed by the Soviet Union and the South, which was backed by the United States. The Cuban Missile Crisis unfolded over a period of 13 days in October 1962, with the resolution coming in the form of a diplomatic agreement. The Vietnam War lasted for 19 years, beginning in 1954 and ending in 1973, with the signing of the Paris Peace Accords. While both of these conflicts were Cold War related, they were also very different in terms of causes, objectives, strategic objectives and outcomes. The Cuban Missile Crisis did not result in any direct military conflict and was resolved primarily through diplomacy and the decision to withdraw nuclear weapons from Cuba. On the other hand, the Vietnam War was a prolonged series of battles resulting in significant losses for both sides. The Vietnam War also had a significant impact on the United States, as it highlighted the realities of a prolonged conflict and the limits of U.S. military intervention. Additionally, the Vietnam War also drastically changed the way public opinion viewed the military and the use of force."
+ },
+ {
+ "text": "Generate a list of random words.",
+ "input": "",
+ "output": "sneeze, conflict, ancestor, thunder, companion, amulet, forge, serpent, magnify, jester, cascade, universe, surge, nausea, oasis, anomaly, pursue, rapture, pagoda, mosaic."
+ },
+ {
+ "text": "Transcribe the recording into text.",
+ "input": "Recording about the history of Waterloo",
+ "output": "The Battle of Waterloo, fought on June 18, 1815, was the last major battle of the Napoleonic Wars. It marked the end of Napoleon Bonaparte's rule. The battle was fought between Napoleon Bonaparte's French forces and an allied force of British, Dutch, Belgian and Prussian troops, led by the Duke of Wellington. It was fought near the town of Waterloo in the former Austrian Netherlands, now present-day Belgium. The battle was a defeat for Napoleon's forces and ended Napoleon's rule over France. Napoleon himself was exiled to St. Helena, where he would die in 1821. The battle marked the end of French domination of Europe and the beginning of a period of peace in the region. It was also the last major battle of the Napoleonic Wars and is remembered as one of the most important battles in history. The victory of the allied forces at Waterloo helped to bring an end to French imperial ambitions in Europe. It also established the British as one of the most powerful nations in Europe. The battle was fought with a combination of infantry, cavalry, and artillery tactics, and showed the beginning of the changing nature of warfare. Although the French forces greatly outnumbered the allies, their strategic and tactical mistake led to the loss of the battle. This defeat signaled the end of French imperial power in Europe. The legacy of Waterloo still stands today, and it cemented the British Empire's position for the next one hundred years."
+ },
+ {
+ "text": "Who is the world's most famous painter?",
+ "input": "",
+ "output": "The world's most famous painter is generally considered to be the Italian renaissance artist Leonardo da Vinci. His work includes the Mona Lisa and The Last Supper, both of which are considered masterpieces. Da Vinci was also a master of science, inventing the helicopter and the tank, among other inventions. He was also a prolific musician, sculptor, and architect. His work has inspired generations of artists and continues to be revered today."
+ },
+ {
+ "text": "Explain the concept of cogging torque.",
+ "input": "",
+ "output": "Cogging torque is the torque that is applied to a rotating device when the device experiences resistance from the teeth of the gears that are connected to the device. Cogging torque occurs due to the reaction force generated when the teeth of the gear drive mesh. This force pushes against each other, resulting in a drag force that is transmitted from the gears to the shaft. The magnitude of the cogging torque can vary depending on the size and number of teeth, as well as the gear's material and design. Cogging torque is undesirable in most motorized applications, as it can lead to loss of efficiency. To reduce cogging torque, gear designs are often symmetrical or have certain tooth profiles which reduce the force of contact between the teeth when they come into contact with each other."
+ },
+ {
+ "text": "Look up the boiling point of water.",
+ "input": "",
+ "output": "The boiling point of water is 100\u00b0C (212\u00b0F), at standard atmospheric pressure. When heated at higher pressures, water's boiling point increases. For example, water will boil at a temperature of 115\u00b0C (239\u00b0F) at an atmospheric pressure of 1 bar. At higher pressures, such as those experienced at the bottom of the ocean, the boiling point of water can be as high as 374\u00b0C (705\u00b0F)."
+ },
+ {
+ "text": "Describe the life and reign of King Charles II.",
+ "input": "",
+ "output": "King Charles II ascended to the throne of England following the English Civil War and the death of his father, Charles I. He was crowned in 1661 and his reign lasted until his death in 1685. Charles II was also a patron of the arts, working with some of the greatest painters and sculptors of his day. He also encouraged theater and sports and had an interest in sciences such as astronomy and natural history. Charles II was a moderate ruler who respected the civil and religious liberties of his citizens. He was tolerant of other religions and gave Jews and Catholics more rights in the kingdom. Charles II was also a strong supporter of the Royal Navy, which saw England increase its power in the world. He also promoted the development of industry and commerce, which helped to modernize the country. Charles II's reign was an important step in restoring the monarchy of England after the turmoil of the English Civil War."
+ },
+ {
+ "text": "Find the area of a circle given its radius.",
+ "input": "Radius = 4",
+ "output": "The area of a circle with a radius of 4 is equal to 12.5664 square units. This is calculated by using the formula A = \u03c0r2, where A is the area, \u03c0 is roughly equal to 3.1416 and r is the radius of the circle."
+ }
+]
\ No newline at end of file
diff --git a/demos/data_visualization_op_effect/.DS_Store b/demos/data_visualization_op_effect/.DS_Store
new file mode 100644
index 000000000..02a008699
Binary files /dev/null and b/demos/data_visualization_op_effect/.DS_Store differ
diff --git a/demos/data_visualization_op_effect/app.py b/demos/data_visualization_op_effect/app.py
new file mode 100644
index 000000000..2d075444f
--- /dev/null
+++ b/demos/data_visualization_op_effect/app.py
@@ -0,0 +1,526 @@
+# Some code here has been modified from:
+# https://huggingface.co/spaces/huggingface/text-data-filtering
+# --------------------------------------------------------
+import copy
+import math
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+import yaml
+
+from data_juicer.config import init_configs
+from data_juicer.core import Analyser
+from data_juicer.ops.base_op import OPERATORS
+
+
+@st.cache_data
+def convert_csv(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_csv().encode('utf-8')
+
+
+@st.cache_data
+def convert_jsonl(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_json(orient='records', lines=True).encode('utf-8')
+
+
+def pretty_out(d):
+ res = ''
+ process = ''
+ op_names = set(OPERATORS.modules.keys())
+ for key, value in d.items():
+ if key == 'process':
+ process = yaml.dump(value,
+ allow_unicode=True,
+ default_flow_style=False)
+ elif key == 'config' or key.split('.')[0] in op_names:
+ continue
+ else:
+ res += f'{key}:\n \t {value}\n'
+ res += 'process:\n' + \
+ '\n'.join(['\t' + line for line in process.splitlines()])
+
+ return res
+
+
+def parse_cfg():
+
+ lang_select = st.session_state.get('lang_select', 'en')
+
+ if lang_select == 'zh':
+ cfg_cmd = '--config configs/demo_zh.yaml'
+ else:
+ cfg_cmd = '--config configs/demo_en.yaml'
+
+ args_in_cmd = cfg_cmd.split()
+
+ if len(args_in_cmd) >= 2 and args_in_cmd[0] == '--config':
+ cfg_f_name = args_in_cmd[1]
+ else:
+ st.warning('Please specify a config command or upload a config file.')
+ st.stop()
+
+ if not os.path.exists(cfg_f_name):
+ st.warning('Cannot parse the config: '
+ f'config file does not exist (cfg_f_name={cfg_f_name})')
+ st.stop()
+
+ with open(cfg_f_name, 'r') as cfg_f:
+ specified_cfg = yaml.safe_load(cfg_f)
+
+ try:
+ parsed_cfg = init_configs(args=args_in_cmd)
+ st.session_state.cfg = parsed_cfg
+
+ return pretty_out(parsed_cfg), pretty_out(specified_cfg), parsed_cfg
+ except Exception as e:
+ return str(e), pretty_out(specified_cfg), None
+
+
+def analyze_and_show_res(dataset_file):
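+ # Run the Analyser on the uploaded (or default) dataset and cache the overall
+ # stats table and the generated 'all-stats' figures in the session state.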
+ images_ori = []
+ cfg = st.session_state.get('cfg', parse_cfg()[2])
+ if cfg is None:
+ raise ValueError('you have not specified a valid cfg')
+ # force saving all stats figures into one file
+ cfg['save_stats_in_one_file'] = True
+
+ del_file = False
+ if dataset_file is not None:
+
+ file_contents = dataset_file.getvalue()
+ with open(dataset_file.name, 'wb') as f:
+ f.write(file_contents)
+ cfg.dataset_path = dataset_file.name
+ del_file = True
+
+ analyzer = Analyser(cfg)
+ dataset = analyzer.run()
+
+ analysis_res_ori = pd.read_csv(
+ os.path.join(analyzer.analysis_path, 'overall.csv'))
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+
+ st.session_state.dataset = dataset
+ st.session_state.original_overall = analysis_res_ori
+ st.session_state.original_imgs = images_ori
+
+ if del_file:
+ os.remove(dataset_file.name)
+
+
+def get_min_max_step(data):
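+ # Choose slider bounds and step size for a stats column: integer steps for
+ # large-valued stats, 0.01 steps for ratio-like stats bounded by 1.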
+ max_value = np.max(data)
+ if max_value > 2.0:
+ min_value = 0
+ max_value = int(max_value + 1)
+ step = 1
+ else:
+ min_value = 0.0
+ max_value = max(1.0, max_value)
+ step = 0.01
+ return min_value, max_value, step
+
+
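+# Mapping from each filter op name to the stats field(s) it produces, used to
+# build the per-op sliders and the effect analysis below.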
+op_stats_dict = {
+ 'alphanumeric_filter': ['alpha_token_ratio', 'alnum_ratio'],
+ 'average_line_length_filter': ['avg_line_length'],
+ 'character_repetition_filter': ['char_rep_ratio'],
+ 'flagged_words_filter': ['flagged_words_ratio'],
+ 'language_id_score_filter': ['lang', 'lang_score'],
+ 'maximum_line_length_filter': ['max_line_length'],
+ 'perplexity_filter': ['perplexity'],
+ 'special_characters_filter': ['special_char_ratio'],
+ 'stopwords_filter': ['stopwords_ratio'],
+ 'text_length_filter': ['text_len'],
+ 'words_num_filter': ['num_words'],
+ 'word_repetition_filter': ['word_rep_ratio'],
+}
+
+
+class Visualize:
+
+ @staticmethod
+ def setup():
+ st.set_page_config(
+ page_title='Juicer',
+ page_icon=':smile:',
+ layout='wide',
+ # initial_sidebar_state="expanded",
+ )
+
+ readme_link = 'https://code.alibaba-inc.com/DAIL-DATA/' \
+ 'data_juicer/blob/master/README.md'
+ st.markdown(
+ '<div align="center"> <font size="70"> Data-Juicer \
+ </font> </div>',
+ unsafe_allow_html=True,
+ )
+ st.markdown(
+ f'<div align="center"> A Data-Centric Text Processing System for \
+ Large Language Models, \
+ see more details in our <a href={readme_link}>Document</a> </div>',
+ unsafe_allow_html=True,
+ )
+
+ @staticmethod
+ def draw_stack_bar(bar_sizes, bar_labels, total_num, title=''):
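+ # Draw one horizontal stacked bar per filter op showing the percentage of
+ # samples retained (green) versus filtered out (red).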
+ filtered_size = [
+ k / total_num * 100 for i, k in enumerate(bar_sizes[::-1])
+ if i % 2 == 0
+ ]
+ retain_size = [
+ k / total_num * 100 for i, k in enumerate(bar_sizes[::-1])
+ if i % 2 != 0
+ ]
+ plt.clf()
+ plt.title(title)
+ bar_labels = bar_labels[::-1]
+ # retained
+ r_bars = plt.barh(bar_labels,
+ retain_size,
+ label='Retained',
+ height=0.5,
+ color='limegreen')
+
+ # filtered
+ f_bars = plt.barh(bar_labels,
+ filtered_size,
+ label='Filtered',
+ left=retain_size,
+ height=0.5,
+ color='orangered')
+
+ for idx, bar in enumerate(r_bars):
+ width = bar.get_width()
+ plt.text(bar.get_x() + width / 2,
+ bar.get_y() + bar.get_height() / 2,
+ f'{retain_size[idx]:.2f}%',
+ ha='center',
+ va='center')
+
+ for idx, bar in enumerate(f_bars):
+ width = bar.get_width()
+ plt.text(bar.get_x() + width / 2,
+ bar.get_y() + bar.get_height() / 2,
+ f'{filtered_size[idx]:.2f}%',
+ ha='center',
+ va='center')
+
+ plt.legend()
+ plt.gcf()
+ st.pyplot(plt, use_container_width=True)
+
+ @staticmethod
+ def display_discarded_ratio(cond, key):
+ if len(cond) > 0:
+ st.caption(
+ f':red[{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}%] \
+ of the total (:red[{len(cond)}]) is discarded with {key}.')
+ else:
+ st.caption(f':red[{0:.2f}%] \
+ of the total (:red[0]) is discarded with {key}.')
+
+ @staticmethod
+ def display_dataset(dataframe, cond, show_num, desp, type, all=True):
+ examples = dataframe.loc[cond]
+ if all or len(examples) > 0:
+ st.subheader(
+ f'{desp}: :red[{len(examples)}] of '
+ f'{len(dataframe.index)} {type} '
+ f'(:red[{len(examples)/len(dataframe.index) * 100:.2f}%])')
+
+ # st.markdown('Click on a column to sort by it, \
+ # place the cursor on the text to display it.')
+ st.dataframe(examples[:show_num], use_container_width=True)
+
+ @staticmethod
+ def draw_hist(data, cutoff=None):
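+ # Plot a histogram of a single stats column and mark the selected cutoff
+ # value(s) with dashed red vertical lines.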
+
+ fig, ax = plt.subplots()
+ data_num = len(data)
+ if data_num >= 100:
+ rec_bins = int(math.sqrt(len(data)))
+ else:
+ rec_bins = 50
+
+ if data_num > 0:
+ ax.hist(data, bins=rec_bins, density=True)
+ if hasattr(data, 'name'):
+ ax.set_title(data.name)
+
+ if isinstance(cutoff, (float, int)):
+ ax.axvline(x=cutoff, color='r', linestyle='dashed')
+ elif isinstance(cutoff, tuple) and len(cutoff) == 2:
+ ax.axvline(x=cutoff[0], color='r', linestyle='dashed')
+ ax.axvline(x=cutoff[1], color='r', linestyle='dashed')
+ st.pyplot(fig)
+
+ @staticmethod
+ def op_effect_analyze():
+ col1, col2, col3 = st.columns(3)
+
+ with col1:
+ label = 'Which language of your dataset'
+ options = ['en', 'zh']
+ lang_select = st.selectbox(
+ label=label,
+ options=options,
+ )
+ st.session_state.lang_select = lang_select
+
+ with col2:
+ dataset_file = st.file_uploader(
+ label='Upload your custom dataset (jsonl/csv)',
+ type=['json', 'jsonl', 'csv'])
+
+ with col3:
+ st.text_area(label='Default Demo dataset',
+ disabled=True,
+ value='data/demo-dataset.jsonl')
+
+ start_btn = st.button('Start to analyze data (per filter op)',
+ use_container_width=True)
+
+ if start_btn:
+ with st.spinner('Analyzing data, please wait...'):
+ analyze_and_show_res(dataset_file)
+
+ with st.expander('Data Analysis Results', expanded=False):
+ original_overall = st.session_state.get('original_overall', None)
+ original_imgs = st.session_state.get('original_imgs', [])
+
+ st.dataframe(original_overall, use_container_width=True)
+ for img in original_imgs:
+ st.image(img, output_format='png')
+ with st.expander('Effect of Filter OPs', expanded=True):
+ dataset = st.session_state.get('dataset', None)
+ if dataset:
+ Visualize.filter_dataset(dataset)
+ else:
+ st.warning('Please analyze data first')
+
+ @staticmethod
+ def filter_dataset(dataset):
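+ # Interactively re-apply the filter ops: build a keep-mask per op from the
+ # sidebar widgets, show retained/discarded samples, and summarize each
+ # op's effect with a stacked bar chart.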
+ text = dataset['text']
+ if 'stats' not in dataset.features:
+ stats = pd.DataFrame(dataset['stats.meta'])
+ else:
+ stats = pd.DataFrame(dataset['stats'])
+ stats['text'] = text
+
+ non_num_list = ['lang']
+ min_cutoff_list = [
+ 'lang_score',
+ 'stopwords_ratio',
+ ]
+ max_cutoff_list = [
+ 'flagged_words_ratio',
+ 'max_ppl',
+ ]
+ mask_list = ['text']
+
+ cfg = st.session_state.get('cfg', None)
+ if cfg is None:
+ return
+
+ def set_sliders(total_stats, ordered):
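+ # Create one sidebar expander per configured op; each stats column gets a
+ # selectbox or slider whose cutoff yields a boolean keep-mask. When
+ # 'ordered' is checked, masks are applied cumulatively in op order.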
+ stats = copy.deepcopy(total_stats)
+ conds = list()
+ index = 1
+ for op_cfg in cfg.process:
+ op_name = list(op_cfg.keys())[0]
+ op_stats = op_stats_dict.get(op_name, [])
+
+ cutoff_ratio = None
+
+ with st.sidebar.expander(f'{index} {op_name}'):
+
+ for column_name in op_stats:
+ if column_name not in stats:
+ continue
+ data = stats[column_name]
+
+ if column_name in non_num_list:
+ options = ['all'] + list(set(data))
+ label = f'Which {column_name} would \
+ you like to keep?'
+
+ selected = st.selectbox(
+ label=label,
+ options=options,
+ )
+ if selected == 'all':
+ cond = [True] * len(data)
+ else:
+ cond = data == selected
+ Visualize.display_discarded_ratio(
+ cond, column_name)
+
+ elif column_name in min_cutoff_list:
+ label = f'If the {column_name} of a document \
+ is lower than this number, \
+ the document is removed.'
+
+ low, high, step = get_min_max_step(data)
+
+ cutoff_ratio = st.slider(label,
+ low,
+ high,
+ low,
+ step=step)
+ cond = data >= cutoff_ratio
+ Visualize.display_discarded_ratio(
+ cond, column_name)
+
+ elif column_name in max_cutoff_list:
+ label = f'If the {column_name} of a document \
+ is higher than this number, \
+ the document is removed.'
+
+ low, high, step = get_min_max_step(data)
+ cutoff_ratio = st.slider(label,
+ low,
+ high,
+ high,
+ step=step)
+ cond = data <= cutoff_ratio
+
+ Visualize.display_discarded_ratio(
+ cond, column_name)
+ elif column_name not in mask_list:
+ # lower
+ label = f'If the {column_name} of a document \
+ is lower than this number, \
+ the document is removed.'
+
+ low, high, step = get_min_max_step(data)
+
+ cutoff_ratio_l = st.slider(label,
+ low,
+ high,
+ low,
+ step=step)
+ cond_l = data >= cutoff_ratio_l
+
+ Visualize.display_discarded_ratio(
+ cond_l, column_name)
+
+ # higher
+ label = f'If the {column_name} of a document \
+ is higher than this number, \
+ the document is removed.'
+
+ cutoff_ratio_h = st.slider(label,
+ low,
+ high,
+ high,
+ step=step)
+
+ cond_h = data <= cutoff_ratio_h
+ Visualize.display_discarded_ratio(
+ cond_h, column_name)
+ cond = [
+ low & high
+ for low, high in zip(cond_l, cond_h)
+ ]
+
+ cutoff_ratio = (cutoff_ratio_l, cutoff_ratio_h)
+
+ if column_name not in mask_list:
+ Visualize.draw_hist(data, cutoff_ratio)
+ conds.append({
+ (' '.join([str(index), op_name]), column_name):
+ cond
+ })
+
+ if ordered:
+ stats = stats.loc[cond]
+ index += 1
+ return conds, stats
+
+ st.subheader('How many samples do you want to show?')
+ show_num = st.number_input(
+ label='How many samples do you want to show?',
+ value=5,
+ label_visibility='hidden')
+
+ st.sidebar.subheader('Parameters of filter ops')
+ ordered = st.sidebar.checkbox('Process by op order')
+ conds, filtered_stats = set_sliders(stats, ordered)
+
+ if ordered:
+ all_conds = [
+ True if i in filtered_stats.index else False
+ for i in range(len(stats))
+ ]
+ else:
+ all_conds = np.all([list(cond.values())[0] for cond in conds],
+ axis=0)
+
+ ds = pd.DataFrame(dataset)
+ Visualize.display_dataset(ds, all_conds, show_num, 'Retained samples',
+ 'docs')
+ st.download_button('Download Retained data as JSONL',
+ data=convert_jsonl(ds.loc[all_conds]),
+ file_name='retained.jsonl')
+ Visualize.display_dataset(ds, np.invert(all_conds), show_num,
+ 'Discarded samples', 'docs')
+ st.download_button('Download Discarded data as JSONL',
+ data=convert_jsonl(ds.loc[np.invert(all_conds)]),
+ file_name='discarded.jsonl')
+ display_discarded_details = st.checkbox(
+ 'Display discarded documents by filter details')
+
+ show_stats = copy.deepcopy(stats)
+ bar_labels = []
+ bar_sizes = []
+ for item in conds:
+ for op_key, cond in item.items():
+ op_name, column_name = op_key
+ if column_name not in mask_list:
+ sub_stats = show_stats[[column_name, 'text']]
+ if display_discarded_details:
+ Visualize.display_dataset(
+ sub_stats,
+ np.invert(cond) if len(cond) > 0 else [],
+ show_num,
+ # f'Discarded documents for the filter on \
+ f'{op_name} {column_name} filtered ',
+ 'docs',
+ )
+ before_filtered_num = len(show_stats.index)
+ if ordered:
+ show_stats = show_stats.loc[cond]
+ retained = np.sum(1 * cond)
+ filtered = before_filtered_num - len(show_stats.index)
+ else:
+ retained = np.sum(1 * cond)
+ filtered = before_filtered_num - retained
+
+ bar_sizes.append(retained)
+ bar_sizes.append(filtered)
+ bar_labels.append(f'{op_name}\n{column_name}')
+
+ bar_title = 'Effect of Filter OPs'
+ Visualize.draw_stack_bar(bar_sizes, bar_labels, len(stats.index),
+ bar_title)
+
+ @staticmethod
+ def visualize():
+ Visualize.setup()
+ Visualize.op_effect_analyze()
+
+
+def main():
+ Visualize.visualize()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demos/data_visualization_op_effect/configs/demo_en.yaml b/demos/data_visualization_op_effect/configs/demo_en.yaml
new file mode 100644
index 000000000..d71266901
--- /dev/null
+++ b/demos/data_visualization_op_effect/configs/demo_en.yaml
@@ -0,0 +1,66 @@
+# Process config example for dataset
+
+# global parameters
+project_name: 'demo'
+dataset_path: './data/demo-dataset.jsonl' # path to your dataset directory or file
+np: 1 # number of subprocesses to process your dataset
+
+export_path: './outputs/demo/demo-processed.jsonl'
+
+# process schedule
+# a list of several process operators with their arguments
+process:
+ # Filter ops
+ - alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
+ tokenization: false # Whether to count the ratio of alphanumeric to the total number of tokens.
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.9 # the max ratio of filter range
+ - average_line_length_filter: # filter text with the average length of lines out of specific range.
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - character_repetition_filter: # filter text with the character repetition ratio out of specific range
+ rep_len: 10 # repetition length for char-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
+ - flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
+ lang: en # consider flagged words in what language
+ tokenization: false # whether to use model to tokenize documents
+ max_ratio: 0.0045 # the max ratio to filter text
+ flagged_words_dir: ./assets # directory to store flagged words dictionaries
+ use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+  - language_id_score_filter:       # keep text in the specified language with a language score larger than a specific min value
+ lang: en # keep text in what language
+ min_score: 0.8 # the min language scores to filter text
+ - maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - perplexity_filter: # filter text with perplexity score out of specific range
+ lang: en # compute perplexity in what language
+ max_ppl: 1500 # the max perplexity score to filter text
+ - special_characters_filter: # filter text with special-char ratio out of specific range
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.25 # the max ratio of filter range
+ - stopwords_filter: # filter text with stopword ratio smaller than a specific min value
+ lang: en # consider stopwords in what language
+ tokenization: false # whether to use model to tokenize documents
+ min_ratio: 0.3 # the min ratio to filter text
+ stopwords_dir: ./assets # directory to store stopwords dictionaries
+ use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+ - text_length_filter: # filter text with length out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - words_num_filter: # filter text with number of words out of specific range
+ lang: en # sample in which language
+ tokenization: false # whether to use model to tokenize documents
+ min_num: 10 # the min number of filter range
+ max_num: 10000 # the max number of filter range
+ - word_repetition_filter: # filter text with the word repetition ratio out of specific range
+ lang: en # sample in which language
+ tokenization: false # whether to use model to tokenize documents
+ rep_len: 10 # repetition length for word-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
diff --git a/demos/data_visualization_op_effect/configs/demo_zh.yaml b/demos/data_visualization_op_effect/configs/demo_zh.yaml
new file mode 100644
index 000000000..c07ea1cf9
--- /dev/null
+++ b/demos/data_visualization_op_effect/configs/demo_zh.yaml
@@ -0,0 +1,66 @@
+# Process config example for dataset
+
+# global parameters
+project_name: 'demo'
+dataset_path: './data/demo-dataset.jsonl' # path to your dataset directory or file
+np: 1  # number of subprocesses to process your dataset
+
+export_path: './outputs/demo/demo-processed.jsonl'
+
+# process schedule
+# a list of several process operators with their arguments
+process:
+ # Filter ops
+ - alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
+ tokenization: false # Whether to count the ratio of alphanumeric to the total number of tokens.
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.9 # the max ratio of filter range
+ - average_line_length_filter: # filter text with the average length of lines out of specific range.
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - character_repetition_filter: # filter text with the character repetition ratio out of specific range
+ rep_len: 10 # repetition length for char-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
+ - flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
+ lang: zh # consider flagged words in what language
+ tokenization: true # whether to use model to tokenize documents
+ max_ratio: 0.0045 # the max ratio to filter text
+ flagged_words_dir: ./assets # directory to store flagged words dictionaries
+ use_words_aug: true # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+  - language_id_score_filter:       # keep text in the specified language with a language score larger than a specific min value
+ lang: zh # keep text in what language
+ min_score: 0.8 # the min language scores to filter text
+ - maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - perplexity_filter: # filter text with perplexity score out of specific range
+ lang: zh # compute perplexity in what language
+ max_ppl: 1500 # the max perplexity score to filter text
+ - special_characters_filter: # filter text with special-char ratio out of specific range
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.25 # the max ratio of filter range
+ - stopwords_filter: # filter text with stopword ratio smaller than a specific min value
+ lang: zh # consider stopwords in what language
+ tokenization: true # whether to use model to tokenize documents
+ min_ratio: 0.3 # the min ratio to filter text
+ stopwords_dir: ./assets # directory to store stopwords dictionaries
+ use_words_aug: true # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+ - text_length_filter: # filter text with length out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - words_num_filter: # filter text with number of words out of specific range
+ lang: zh # sample in which language
+ tokenization: true # whether to use model to tokenize documents
+ min_num: 10 # the min number of filter range
+ max_num: 10000 # the max number of filter range
+ - word_repetition_filter: # filter text with the word repetition ratio out of specific range
+ lang: zh # sample in which language
+ tokenization: true # whether to use model to tokenize documents
+ rep_len: 10 # repetition length for word-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
diff --git a/demos/data_visualization_op_effect/data/demo-dataset.jsonl b/demos/data_visualization_op_effect/data/demo-dataset.jsonl
new file mode 100644
index 000000000..4d8cdadfd
--- /dev/null
+++ b/demos/data_visualization_op_effect/data/demo-dataset.jsonl
@@ -0,0 +1,6 @@
+{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
+{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
+{"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}}
+{"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}}
+{"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}}
+{"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
diff --git a/demos/data_visualization_statistics/.DS_Store b/demos/data_visualization_statistics/.DS_Store
new file mode 100644
index 000000000..6a4643e38
Binary files /dev/null and b/demos/data_visualization_statistics/.DS_Store differ
diff --git a/demos/data_visualization_statistics/app.py b/demos/data_visualization_statistics/app.py
new file mode 100644
index 000000000..e019b290b
--- /dev/null
+++ b/demos/data_visualization_statistics/app.py
@@ -0,0 +1,176 @@
+import os
+
+import pandas as pd
+import streamlit as st
+import yaml
+from loguru import logger
+
+from data_juicer.config import init_configs
+from data_juicer.core import Analyser
+from data_juicer.ops.base_op import OPERATORS
+
+
+@st.cache_data
+def convert_csv(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_csv().encode('utf-8')
+
+
+@st.cache_data
+def convert_jsonl(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_json(orient='records', lines=True).encode('utf-8')
+
+
+def pretty_out(d):
+ res = ''
+ process = ''
+ op_names = set(OPERATORS.modules.keys())
+ for key, value in d.items():
+ if key == 'process':
+ process = yaml.dump(value,
+ allow_unicode=True,
+ default_flow_style=False)
+ elif key == 'config' or key.split('.')[0] in op_names:
+ continue
+ else:
+ res += f'{key}:\n \t {value}\n'
+ res += 'process:\n' + \
+ '\n'.join(['\t' + line for line in process.splitlines()])
+
+ return res
+
+
+def parse_cfg():
+
+ cfg_cmd = '--config configs/demo.yaml'
+
+ args_in_cmd = cfg_cmd.split()
+
+ if len(args_in_cmd) >= 2 and args_in_cmd[0] == '--config':
+ cfg_f_name = args_in_cmd[1]
+ else:
+ st.warning('Please specify a config command or upload a config file.')
+ st.stop()
+
+ if not os.path.exists(cfg_f_name):
+        st.warning('Cannot parse the config: '
+                   f'the config file does not exist (cfg_f_name={cfg_f_name})')
+ st.stop()
+
+ with open(cfg_f_name, 'r') as cfg_f:
+ specified_cfg = yaml.safe_load(cfg_f)
+
+ try:
+ parsed_cfg = init_configs(args=args_in_cmd)
+ st.session_state.cfg = parsed_cfg
+
+ return pretty_out(parsed_cfg), pretty_out(specified_cfg), parsed_cfg
+ except Exception as e:
+ return str(e), pretty_out(specified_cfg), None
+
+
+def analyze_and_show_res(dataset_file):
+
+ images_ori = []
+ cfg = st.session_state.get('cfg', parse_cfg()[2])
+ if cfg is None:
+        raise ValueError('No valid config is specified')
+    # force generating the combined all-stats figure in a single file
+    cfg['save_stats_in_one_file'] = True
+
+ del_file = False
+ logger.info('=========Stage: analyze original data=========')
+ if dataset_file is not None:
+
+ file_contents = dataset_file.getvalue()
+ with open(dataset_file.name, 'wb') as f:
+ f.write(file_contents)
+ cfg.dataset_path = dataset_file.name
+ del_file = True
+
+ analyzer = Analyser(cfg)
+ dataset = analyzer.run()
+
+ analysis_res_ori = pd.read_csv(
+ os.path.join(analyzer.analysis_path, 'overall.csv'))
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+
+ st.session_state.dataset = dataset
+ st.session_state.original_overall = analysis_res_ori
+ st.session_state.original_imgs = images_ori
+ if del_file:
+ os.remove(dataset_file.name)
+
+
+class Visualize:
+
+ @staticmethod
+ def setup():
+ st.set_page_config(
+ page_title='Data-Juicer',
+            page_icon=':smile:',
+ layout='wide',
+ # initial_sidebar_state="expanded",
+ )
+
+ readme_link = 'https://code.alibaba-inc.com/DAIL-DATA/' \
+ 'data_juicer/blob/master/README.md'
+        st.markdown(
+            '<div align="center"><font size="70">Data-Juicer</font></div>',
+            unsafe_allow_html=True,
+        )
+        st.markdown(
+            f'<div align="center">A Data-Centric Text Processing System for '
+            f'Large Language Models, see more details in '
+            f'<a href={readme_link}>Document</a></div>',
+            unsafe_allow_html=True,
+        )
+
+ @staticmethod
+ def analyze_process():
+ col1, col2 = st.columns(2)
+ with col1:
+ dataset_file = st.file_uploader(
+                label='Upload your custom dataset (csv/json/jsonl)',
+ type=['csv', 'json', 'jsonl'])
+ with col2:
+ st.text_area(label='Default Demo dataset',
+ disabled=True,
+ value='demo/demo-dataset.jsonl')
+
+ start_btn = st.button(
+ '2. Start to analyze original data (per filter op)',
+ use_container_width=True)
+
+ with st.expander('Data Analysis Results', expanded=True):
+
+ if start_btn:
+                with st.spinner('Analyzing the dataset, please wait...'):
+ analyze_and_show_res(dataset_file)
+
+ original_overall = st.session_state.get('original_overall', None)
+ original_imgs = st.session_state.get('original_imgs', [])
+
+ st.header('Statistics')
+ st.dataframe(original_overall, use_container_width=True)
+ if len(original_imgs) > 0:
+ st.header('Histograms')
+ for img in original_imgs:
+                    st.image(img, output_format='png', use_column_width=True)
+
+ @staticmethod
+ def visualize():
+ Visualize.setup()
+ Visualize.analyze_process()
+
+
+def main():
+ Visualize.visualize()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demos/data_visualization_statistics/configs/demo.yaml b/demos/data_visualization_statistics/configs/demo.yaml
new file mode 100644
index 000000000..d71266901
--- /dev/null
+++ b/demos/data_visualization_statistics/configs/demo.yaml
@@ -0,0 +1,66 @@
+# Process config example for dataset
+
+# global parameters
+project_name: 'demo'
+dataset_path: './data/demo-dataset.jsonl' # path to your dataset directory or file
+np: 1  # number of subprocesses to process your dataset
+
+export_path: './outputs/demo/demo-processed.jsonl'
+
+# process schedule
+# a list of several process operators with their arguments
+process:
+ # Filter ops
+ - alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
+ tokenization: false # Whether to count the ratio of alphanumeric to the total number of tokens.
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.9 # the max ratio of filter range
+ - average_line_length_filter: # filter text with the average length of lines out of specific range.
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - character_repetition_filter: # filter text with the character repetition ratio out of specific range
+ rep_len: 10 # repetition length for char-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
+ - flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
+ lang: en # consider flagged words in what language
+ tokenization: false # whether to use model to tokenize documents
+ max_ratio: 0.0045 # the max ratio to filter text
+ flagged_words_dir: ./assets # directory to store flagged words dictionaries
+ use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+  - language_id_score_filter:       # keep text in the specified language with a language score larger than a specific min value
+ lang: en # keep text in what language
+ min_score: 0.8 # the min language scores to filter text
+ - maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - perplexity_filter: # filter text with perplexity score out of specific range
+ lang: en # compute perplexity in what language
+ max_ppl: 1500 # the max perplexity score to filter text
+ - special_characters_filter: # filter text with special-char ratio out of specific range
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.25 # the max ratio of filter range
+ - stopwords_filter: # filter text with stopword ratio smaller than a specific min value
+ lang: en # consider stopwords in what language
+ tokenization: false # whether to use model to tokenize documents
+ min_ratio: 0.3 # the min ratio to filter text
+ stopwords_dir: ./assets # directory to store stopwords dictionaries
+ use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
+ words_aug_group_sizes: [2] # the group size of words to augment
+ words_aug_join_char: "" # the join char between words to augment
+ - text_length_filter: # filter text with length out of specific range
+ min_len: 10 # the min length of filter range
+ max_len: 10000 # the max length of filter range
+ - words_num_filter: # filter text with number of words out of specific range
+ lang: en # sample in which language
+ tokenization: false # whether to use model to tokenize documents
+ min_num: 10 # the min number of filter range
+ max_num: 10000 # the max number of filter range
+ - word_repetition_filter: # filter text with the word repetition ratio out of specific range
+ lang: en # sample in which language
+ tokenization: false # whether to use model to tokenize documents
+ rep_len: 10 # repetition length for word-level n-gram
+ min_ratio: 0.0 # the min ratio of filter range
+ max_ratio: 0.5 # the max ratio of filter range
diff --git a/demos/data_visualization_statistics/data/demo-dataset.jsonl b/demos/data_visualization_statistics/data/demo-dataset.jsonl
new file mode 100644
index 000000000..4d8cdadfd
--- /dev/null
+++ b/demos/data_visualization_statistics/data/demo-dataset.jsonl
@@ -0,0 +1,6 @@
+{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
+{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
+{"text": "你好,请问你是谁", "meta": {"src": "customized", "author": "xxx"}}
+{"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.", "meta": {"src": "Oscar", "version": "2.0"}}
+{"text": "欢迎来到阿里巴巴!", "meta": {"src": "customized", "version": "0.1", "author": "xxx"}}
+{"text": "This paper proposed a novel method on LLM pretraining.", "meta": {"src": "customized", "author": "xxx"}}
diff --git a/demos/tool_quality_classifier/.DS_Store b/demos/tool_quality_classifier/.DS_Store
new file mode 100644
index 000000000..5127a64e1
Binary files /dev/null and b/demos/tool_quality_classifier/.DS_Store differ
diff --git a/demos/tool_quality_classifier/app.py b/demos/tool_quality_classifier/app.py
new file mode 100644
index 000000000..6282dc133
--- /dev/null
+++ b/demos/tool_quality_classifier/app.py
@@ -0,0 +1,175 @@
+import os
+
+import streamlit as st
+from loguru import logger
+
+from quality_classifier.qc_utils import (init_spark, load_dataset, predict,
+ prepare_model)
+
+
+@st.cache_data
+def install_jdk():
+
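+    # PySpark needs a JVM to run; install a default JRE/JDK here so the demo
+    # also works in a bare container environment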
+ os.system('apt update')
+ os.system('apt install -y default-jre')
+ os.system('apt install -y default-jdk')
+    os.environ['JAVA_HOME'] = '/usr/lib/jvm/default-java'
+
+
+@st.cache_data
+def convert_csv(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_csv().encode('utf-8')
+
+
+@st.cache_data
+def convert_jsonl(df):
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
+ return df.to_json(orient='records', lines=True).encode('utf-8')
+
+
+@st.cache_resource
+def st_init_spark():
+ return init_spark()
+
+
+@st.cache_resource
+def st_prepare_model(model_name):
+ return prepare_model(model_name)
+
+
+def st_load_dataset(spark, ds_path, text_key='text', only_text=False):
+ return load_dataset(spark=spark,
+ ds_path=ds_path,
+ text_key=text_key,
+ only_text=only_text)
+
+
+def st_predict(model, ds, tokenizer=None, keep_method='label'):
+ return predict(model=model,
+ ds=ds,
+ tokenizer=tokenizer,
+ keep_method=keep_method)
+
+
+def quality_classifier(dataset_file, model):
+
+ del_file = False
+
+ logger.info('=========Stage: analyze original data=========')
+ if dataset_file is not None:
+ file_contents = dataset_file.getvalue()
+ with open(dataset_file.name, 'wb') as f:
+ f.write(file_contents)
+ dataset_path = dataset_file.name
+ del_file = True
+ else:
+ dataset_path = st.session_state.get('default_demo_dataset')
+
+ if model == 'chinese':
+ tokenizer = 'zh.sp.model'
+ keep_method = 'label'
+ if model == 'code':
+ tokenizer = 'code.sp.model'
+ keep_method = 'label'
+ if model == 'gpt3':
+ tokenizer = None
+ keep_method = 'gpt3'
+
+ spark = st_init_spark()
+ model = st_prepare_model(model_name=model)
+ ds = st_load_dataset(spark, dataset_path)
+
+ pred = st_predict(model, ds, tokenizer=tokenizer, keep_method=keep_method)
+ overall = pred.select('doc_score').toPandas().describe(include='all')
+
+ st.session_state.dataset = pred
+ st.session_state.original_overall = overall
+ if del_file:
+ os.remove(dataset_file.name)
+
+
+class Visualize:
+
+ @staticmethod
+ def setup():
+ st.set_page_config(
+ page_title='Data-Juicer',
+            page_icon=':smile:',
+ layout='wide',
+ # initial_sidebar_state="expanded",
+ )
+
+ install_jdk()
+
+ readme_link = 'https://code.alibaba-inc.com/DAIL-DATA/' \
+ 'data_juicer/blob/master/README.md'
+        st.markdown(
+            '<div align="center"><font size="70">Data-Juicer</font></div>',
+            unsafe_allow_html=True,
+        )
+        st.markdown(
+            f'<div align="center">A Data-Centric Text Processing System for '
+            f'Large Language Models, see more details in '
+            f'<a href={readme_link}>Document</a></div>',
+            unsafe_allow_html=True,
+        )
+
+ @staticmethod
+ def quality():
+ col1, col2 = st.columns(2)
+ with col1:
+ dataset_file = st.file_uploader(
+                label='Upload your custom dataset (json/jsonl/parquet)',
+ type=['json', 'jsonl', 'parquet'])
+
+ st.text_input(label='Default Demo dataset',
+ disabled=True,
+                          key='default_demo_dataset',
+ value='data/demo-dataset.jsonl')
+ with col2:
+ label = 'Select a quality classifier'
+ quality_model_map = {
+ 'Chinese quality classifier': 'chinese',
+ 'Code quality classifier': 'code',
+ 'GPT3 quality classifier': 'gpt3'
+ }
+
+ selected_model = st.selectbox(label=label,
+ options=list(
+ quality_model_map.keys()))
+ model_name = quality_model_map[selected_model]
+
+ start_btn = st.button(
+ f'2. Start to analyze dataset with {selected_model}',
+ use_container_width=True)
+
+ with st.expander(f'{selected_model} Results', expanded=True):
+
+ if start_btn:
+                with st.spinner('Analyzing the dataset, please wait...'):
+ quality_classifier(dataset_file, model_name)
+
+ col1, col2 = st.columns(2)
+ with col1:
+ original_overall = st.session_state.get(
+ 'original_overall', None)
+ st.header('Statistics')
+ st.dataframe(original_overall, use_container_width=True)
+ with col2:
+ pred = st.session_state.get('dataset', None)
+ st.header('Details')
+ st.dataframe(pred, use_container_width=True)
+
+ @staticmethod
+ def visualize():
+ Visualize.setup()
+ Visualize.quality()
+
+
+def main():
+ Visualize.visualize()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demos/tool_quality_classifier/data/demo-dataset.jsonl b/demos/tool_quality_classifier/data/demo-dataset.jsonl
new file mode 100644
index 000000000..14aa71f9a
--- /dev/null
+++ b/demos/tool_quality_classifier/data/demo-dataset.jsonl
@@ -0,0 +1,11 @@
+{"text":"What’s one thing you wish everyone knew about the brain?\nibble\nWhat’s one thing you wish everyone knew about the brain?\nThe place to have real conversations and understand each other better. Join a community or build and grow your own with groups, threads, and conversations.\nSee this content immediately after install\nGet The App\n"}
+{"text":"JavaScript must be enabled to use the system\n"}
+{"text":"中国企业又建成一座海外三峡工程!-科技-高清完整正版视频在线观看-优酷\n"}
+{"text":"Skip to content\nPOLIDEPORTES\nPeriodismo especialzado en deportes\nPrimary Menu\nPOLIDEPORTES\nPolideportes\n¿Quiénes somos?\nNoticia\nEntrevistas\nReportaje\nEquipos de Época\nOpinión\nEspeciales\nCopa Poli\nBuscar:\nSteven Villegas Ceballos patinador\nShare this...\nFacebook\nTwitter\nLinkedin\nWhatsapp\nEmail\nSeguir leyendo\nAnterior El imparable campeón Steven Villegas\nTe pueden interesar\nDeportes\nNoticia\nPiezas filatélicas llegan al Museo Olímpico Colombiano\nmarzo 17, 2023"}
+{"text":"Redirect Notice\nRedirect Notice\nThe previous page is sending you to http:\/\/sieuthikhoavantay.vn\/chi-tiet\/khoa-van-tay-dessmann-s710fp-duc.\nIf you do not want to visit that page, you can return to the previous page.\n"}
+{"text": "Do you need a cup of coffee?"}
+{"text": ".cv域名是因特网域名管理机构ICANN为佛得角共和国(The Republic of Cape Verde República de Cabo Verde)国家及地区分配的顶级域(ccTLD),作为其国家及地区因特网顶级域名。- 奇典网络\n专业的互联网服务提供商 登录 注册 控制中心 新闻中心 客户支持 交费方式 联系我们\n首页\n手机AI建站\n建站\n推广\n域名\n主机\n安全\n企业服务\n加盟\nICANN与CNNIC双认证顶级注册商 在中国,奇典网络是域名服务提供商\n.cv\n.cv域名是ICANN为佛得角共和国国家及地区分配的顶级域名,注册期限1年到10年不等。\n价格: 845 元\/1年\n注册要求: 无要求\n.cv\/.com.cv注册要求\n更多国别域名\n更多NewG域名\n相关资质\n1.什么是 .cv\/.com.cv域名?有什么优势?\n.cv域名是因特网域名管理机构ICANN为佛得角共和国(The Republic of Cape Verde República de Cabo Verde)国家及地区分配的顶级域(ccTLD),作为其国家及地区因特网顶级域名。\n2.cv\/.com.cv域名长度为多少?有什么注册规则?"}
+{"text": "Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément."}
+{"text": "欢迎来到阿里巴巴!"}
+{"text": "This paper proposed a novel method on LLM pretraining."}
+{"text":"世界十大网投平台_2022年卡塔尔世界杯官网\n177-8228-4819\n网站首页\n关于我们\n产品展示\n广告牌制作 广告灯箱制作 标识牌制作 楼宇亮化工程 门头店招制作 不锈钢金属字制作 LED发光字制作 形象墙Logo墙背景墙制作 LED显示屏制作 装饰装潢工程 铜字铜牌制作 户外广告 亚克力制品 各类广告设计 建筑工地广告制作 楼顶大字制作|楼顶发光字制作 霓虹灯制作 三维扣板|3D扣板|广告扣板 房地产广告制作设计 精神堡垒|立牌|指示牌制作 大型商业喷绘写真 展览展示 印刷服务\n合作伙伴\n新闻资讯\n公司新闻 行业新闻 制作知识 设计知识\n成功案例\n技术园地\n联系方式\n"}
\ No newline at end of file
diff --git a/demos/tool_quality_classifier/quality_classifier/__init__.py b/demos/tool_quality_classifier/quality_classifier/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/demos/tool_quality_classifier/quality_classifier/eval.py b/demos/tool_quality_classifier/quality_classifier/eval.py
new file mode 100644
index 000000000..06eb72069
--- /dev/null
+++ b/demos/tool_quality_classifier/quality_classifier/eval.py
@@ -0,0 +1,92 @@
+# This tool is used for evaluating a quality classifier on your own datasets
+# based on PySpark.
+#
+# We provide several trained models for you. Please refer to the comments at
+# the beginning of predict tool for more details.
+#
+# This tool needs several arguments:
+# - positive_datasets: the paths to the positive datasets. It could be a
+# string for a single dataset, e.g. 'pos.parquet', or a list of strings
+# for several datasets, e.g. '["pos1.parquet", "pos2.parquet"]'.
+# - negative_datasets: the paths to the negative datasets. It could be a
+# string for a single dataset, e.g. 'neg.parquet', or a list of strings
+# for several datasets, e.g. '["neg1.parquet", "neg2.parquet"]'.
+# - model: quality classifier name to apply. It's "my_quality_model" in
+#     default, i.e. the default output path of the train tool. You can use
+#     one of ["gpt3", "chinese", "code"] we provided, or you can set it to
+#     the path to your own model trained using the train.py tool.
+# - tokenizer: what tokenizer to use to tokenize texts. It's None in default,
+# which means using the standard Tokenizer of PySpark. You can use one of
+# ["zh.sp.model", "code.sp.model"] we provided, or you can set it to the
+# path to your own sentencepiece model.
+# - text_key: the field key name to hold texts to be classified. It's "text"
+# in default.
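+#
+# A hypothetical invocation (the dataset paths below are placeholders, not
+# files shipped with this tool):
+#
+#   python eval.py \
+#     --positive_datasets '["pos.parquet"]' \
+#     --negative_datasets '["neg.parquet"]' \
+#     --model my_quality_model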
+
+import fire
+from loguru import logger
+
+from qc_utils import eval, init_spark, load_datasets
+
+
+@logger.catch
+def main(positive_datasets=None,
+ negative_datasets=None,
+ model='my_quality_model',
+ tokenizer=None,
+ text_key='text'):
+ """
+
+ :param positive_datasets: the paths to the positive datasets. It could be a
+ string for a single dataset, e.g. 'pos.parquet', or a list of strings
+ for several datasets, e.g. '["pos1.parquet", "pos2.parquet"]'.
+ :param negative_datasets: the paths to the negative datasets. It could be a
+ string for a single dataset, e.g. 'neg.parquet', or a list of strings
+ for several datasets, e.g. '["neg1.parquet", "neg2.parquet"]'.
+    :param model: quality classifier name to apply. It's "my_quality_model"
+        in default, i.e. the default output path of the train tool. You can
+        use one of ["gpt3", "chinese", "code"] we provided, or you can set it
+        to the path to your own model trained using the train.py tool.
+ :param tokenizer: what tokenizer to use to tokenize texts. It's None in
+ default, which means using the standard Tokenizer of PySpark. You can
+ use one of ["zh.sp.model", "code.sp.model"] we provided, or you can set
+ it to the path to your own sentencepiece model.
+ :param text_key: the field key name to hold texts to be classified. It's
+ "text" in default.
+ :return:
+ """
+ # convert a single dataset to a dataset list
+ if positive_datasets is None:
+ positive_datasets = []
+ if negative_datasets is None:
+ negative_datasets = []
+ if isinstance(positive_datasets, str):
+ positive_datasets = [positive_datasets]
+ if isinstance(negative_datasets, str):
+ negative_datasets = [negative_datasets]
+
+ spark = init_spark()
+
+ pos = load_datasets(spark,
+ positive_datasets,
+ text_key=text_key,
+ label=1,
+ only_text=True)
+ neg = load_datasets(spark,
+ negative_datasets,
+ text_key=text_key,
+ label=0,
+ only_text=True)
+ # merge pos and neg samples
+ if pos is not None and neg is not None:
+ ds = pos.unionAll(neg)
+ elif pos is not None:
+ ds = pos
+ elif neg is not None:
+ ds = neg
+ else:
+ logger.error('Empty dataset.')
+ exit(0)
+ logger.info(f'Number of samples: {ds.count()}')
+ eval(model, ds, tokenizer)
+
+
+if __name__ == '__main__':
+ fire.Fire(main)
diff --git a/demos/tool_quality_classifier/quality_classifier/predict.py b/demos/tool_quality_classifier/quality_classifier/predict.py
new file mode 100644
index 000000000..ddbb084b7
--- /dev/null
+++ b/demos/tool_quality_classifier/quality_classifier/predict.py
@@ -0,0 +1,120 @@
+# This tool is used for predicting a document score for text samples using
+# quality classifier models we provided, including:
+# - gpt3: A GPT3 quality classifier reproduced from scratch by us based on
+# PySpark. It's trained over CC as negative samples and Wikipedia-en,
+# Books, OpenWebText as positive samples.
+# - chinese: A quality classifier for Chinese. It's trained over Chinese
+# texts sampled from CC as negative samples and Wudao, Wikipedia-zh as
+# positive samples.
+# - code: A quality classifier for code. It's trained over code samples that
+#     have stars >= 1372 as positive samples and random samples from the
+#     remaining data as negative samples. The star count 1372 splits off a
+#     subset of nearly 7 million samples with the most stars.
+# All these 3 classifiers are trained using the same training pipeline as GPT3
+# based on PySpark but with different tokenizers and keeping methods:
+# - gpt3: standard Tokenizer from spark & GPT3 keeping method based on pareto
+# - chinese: sentencepiece tokenizer for Chinese & label
+# - code: sentencepiece tokenizer for code & label
+#
+# This tool needs several arguments:
+# - dataset_path: the path to the dataset you want to predict doc_scores for.
+# - result_path: the path to store the predicted result dataset.
+# - model: quality classifier name to apply. It's "gpt3" in default. You can
+# use one of ["gpt3", "chinese", "code"] we provided, or you can set it
+# to the path to your own model trained using the train.py tool.
+# - tokenizer: what tokenizer to use to tokenize texts. It's None in default,
+# which means using the standard Tokenizer of PySpark. You can use one of
+# ["zh.sp.model", "code.sp.model"] we provided, or you can set it to the
+# path to your own sentencepiece model.
+# - keep_method: the method to label should_keep field for each sample. It's
+# "gpt3" in default. Should be one of ["gpt3", "label"].
+# - text_key: the field key name to hold texts to be classified. It's "text"
+# in default.
+# - overall_statics: whether to output an overall statistics report on the
+#     predicted document scores. It's False in default.
+#
+# Recommended arguments for provided trained models:
+# - gpt3:
+# - model: gpt3
+# - tokenizer: None
+# - keep_method: gpt3
+# - chinese:
+# - model: chinese
+# - tokenizer: zh.sp.model
+# - keep_method: label
+# - code:
+# - model: code
+# - tokenizer: code.sp.model
+# - keep_method: label
+#
+# Notice:
+# 1. The configs of SparkSession in function init_spark can be modified to be
+# more suitable for your own machine. See function init_spark in
+# qc_utils.py.
+# 2. Random factors are involved in the "gpt3" keep method, so you might get
+#     different should_keep labels across runs, but the doc_score predictions
+#     should stay the same across runs.
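+#
+# A hypothetical invocation (the dataset and result paths below are
+# placeholders, not files shipped with this tool):
+#
+#   python predict.py \
+#     --dataset_path your_dataset.jsonl \
+#     --result_path ./result.parquet \
+#     --model gpt3 \
+#     --overall_statics True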
+
+import os
+
+import fire
+from loguru import logger
+
+from qc_utils import (export_result, init_spark, load_dataset, predict,
+ prepare_model)
+
+
+@logger.catch
+def main(dataset_path,
+ result_path,
+ model='gpt3',
+ tokenizer=None,
+ keep_method='gpt3',
+ text_key='text',
+ overall_statics=False):
+ """
+ Apply quality classifier for your dataset.
+ :param dataset_path: the path to the dataset you want to predict for.
+ :param result_path: the path to store the predicted result dataset.
+ :param model: quality classifier name to apply. It's "gpt3" in default. You
+ can use one of ["gpt3", "chinese", "code"] we provided, or you can set
+ it to the path to your own model trained using the train.py tool.
+ :param tokenizer: what tokenizer to use to tokenize texts. It's None in
+ default, which means using the standard Tokenizer of PySpark. You can
+ use one of ["zh.sp.model", "code.sp.model"] we provided, or you can set
+ it to the path to your own sentencepiece model.
+ :param keep_method: the method to label should_keep field for each sample.
+ It's "gpt3" in default. Should be one of ["gpt3", "label"].
+ :param text_key: the field key name to hold texts to be classified. It's
+ "text" in default.
+    :param overall_statics: whether to output an overall statistics report on
+        the predicted document scores. It's False in default.
+ :return:
+ """
+ # set default tokenizers for default models
+ if model == 'chinese':
+ tokenizer = 'zh.sp.model'
+ keep_method = 'label'
+ if model == 'code':
+ tokenizer = 'code.sp.model'
+ keep_method = 'label'
+ if model == 'gpt3':
+ tokenizer = None
+ keep_method = 'gpt3'
+
+ spark = init_spark()
+ model = prepare_model(model_name=model)
+ ds = load_dataset(spark, dataset_path, text_key=text_key)
+ pred = predict(model, ds, tokenizer=tokenizer, keep_method=keep_method)
+ export_result(pred, result_path)
+
+ # generate overall statistics on doc scores
+ if overall_statics:
+ overall = pred.select('doc_score').toPandas().describe(include='all')
+ # export to result report file
+ overall.to_csv(os.path.join(result_path, 'overall.csv'))
+ overall.to_markdown(os.path.join(result_path, 'overall.md'))
+
+
+if __name__ == '__main__':
+ fire.Fire(main)
diff --git a/demos/tool_quality_classifier/quality_classifier/qc_utils.py b/demos/tool_quality_classifier/quality_classifier/qc_utils.py
new file mode 100644
index 000000000..862e6f1bd
--- /dev/null
+++ b/demos/tool_quality_classifier/quality_classifier/qc_utils.py
@@ -0,0 +1,214 @@
+import os
+import zipfile
+
+import numpy as np
+import sentencepiece as spm
+import wget
+from loguru import logger
+from pyspark.ml import Pipeline, PipelineModel
+from pyspark.ml.classification import LogisticRegression
+from pyspark.ml.feature import HashingTF, Tokenizer
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col, rand, udf
+from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StringType
+
+from data_juicer.utils.cache_utils import DATA_JUICER_MODELS_CACHE
+from data_juicer.utils.model_utils import (MODEL_LINKS,
+ prepare_sentencepiece_model)
+
+
+def init_spark():
+ """
+ Initialize a spark session. You can set parameters such as memory, number
+ of partitions, timeout and so on here.
+ :return: A spark session instance.
+ """
+ spark = (SparkSession.builder.config('spark.driver.memory', '64g').config(
+ 'spark.executor.memory',
+ '64g').config('spark.sql.shuffle.partitions', '300').config(
+ 'spark.sql.execution.arrow.pyspark.enabled',
+ 'true').config('spark.executor.memoryOverhead', '20000').config(
+ 'spark.network.timeout',
+ '10000s').config('spark.executor.heartbeatInterval',
+ '3600s').getOrCreate())
+ logger.info('Spark initialization done.')
+ return spark
+
+
+def prepare_model(model_name, model_path=DATA_JUICER_MODELS_CACHE):
+ udm = False
+ if model_name not in ['gpt3', 'chinese', 'code']:
+        # use a user-specified model
+ real_model_path = model_name
+ udm = True
+ else:
+ # use prepared models we provided
+ model_name = '%s_quality_model' % model_name
+ real_model_path = os.path.join(model_path, model_name)
+ logger.info(f'Preparing scorer model in [{real_model_path}]...')
+ if os.path.exists(real_model_path) and os.path.isdir(real_model_path):
+ return PipelineModel.load(real_model_path)
+ if udm:
+ logger.error(f'Customized model [{real_model_path}] cannot be loaded.')
+ exit(0)
+ # No specific models in local file systems. Download them from remote.
+ os.makedirs(model_path, exist_ok=True)
+ wget.download(os.path.join(MODEL_LINKS, f'{model_name}.zip'),
+ os.path.join(model_path, f'{model_name}.zip'),
+ bar=None)
+    with zipfile.ZipFile(os.path.join(model_path, f'{model_name}.zip')) as zf:
+        zf.extractall(model_path)
+ return PipelineModel.load(real_model_path)
+
+
+def load_dataset(spark, ds_path, text_key='text', only_text=False):
+ logger.info(f'Loading dataset from [{ds_path}]...')
+ if ds_path.endswith('.json') or ds_path.endswith('.jsonl'):
+ df = spark.read.json(ds_path)
+ elif ds_path.endswith('.parquet'):
+ df = spark.read.parquet(ds_path)
+ else:
+ raise NotImplementedError('Dataset type is not supported for now. '
+ 'Suffix of dataset file should be one of '
+ '[.json, .jsonl, .parquet]')
+ if text_key != 'text':
+ df = df.withColumnRenamed(text_key, 'text')
+ if only_text:
+ return df.select('text')
+ else:
+ return df
+
+
+def load_datasets(spark,
+ ds_paths,
+ text_key='text',
+ label=None,
+ only_text=True):
+ if len(ds_paths) == 0:
+ logger.warning('No dataset path provided.')
+ return None
+ base_ds = load_dataset(spark, ds_paths[0], text_key, only_text)
+ for i in range(1, len(ds_paths)):
+ base_ds = base_ds.unionAll(
+ load_dataset(spark, ds_paths[i], text_key, only_text))
+ if label is not None:
+ # add labels for training pipelines
+ return base_ds.selectExpr('text', '%d as label' % label)
+ else:
+ return base_ds
+
+
+def shuffle(df):
+ temp_df = df.withColumn('rand', rand(seed=42))
+ df_rnd = temp_df.orderBy(temp_df.rand)
+ return df_rnd.drop(df_rnd.rand)
+
+
+def export_result(ds, res_path):
+ logger.info(f'Exporting predicted result to [{res_path}]')
+ if res_path.endswith('.json') or res_path.endswith('.jsonl'):
+ ds.write.mode('overwrite').format('json').save(res_path)
+ elif res_path.endswith('.parquet'):
+ ds.write.mode('overwrite').format('parquet').save(res_path)
+ else:
+ ds.write.mode('overwrite').save(res_path)
+
+
+def get_keep_method_udf(keep_method):
+ if keep_method == 'label':
+ return udf(lambda score: int(score > 0.5), IntegerType())
+ elif keep_method == 'gpt3':
+ pareto = 9
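+        # GPT-3 style keeping: keep a sample when its doc_score exceeds
+        # 1 minus a draw from a Pareto distribution (shape alpha=9), so
+        # high-scoring docs are kept more often while a few low-scoring
+        # ones still pass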
+ return udf(lambda score: int(score > 1 - np.random.pareto(pareto)),
+ IntegerType())
+ else:
+ raise NotImplementedError(f'Keep method [{keep_method}] is not '
+ f'implemented for now.')
+
+
+def tokenize_dataset(ds, tokenizer):
+ if os.path.exists(tokenizer):
+ # if it's a local model
+ tkn = spm.SentencePieceProcessor()
+ tkn.load(tokenizer)
+ else:
+ # else, try to load it from our remote model list
+ tkn = prepare_sentencepiece_model(tokenizer, ())
+ tokenizer_udf = udf(lambda text: tkn.encode_as_pieces(text),
+ ArrayType(StringType()))
+ logger.info('Tokenize texts using specific tokenizer...')
+ return ds.withColumn('words', tokenizer_udf(col('text')))
+
+
+def train(output_model_path, ds, tokenizer=None):
+    logger.info('Preparing to train the quality classifier model...')
+ if tokenizer:
+ # tokenizer is not standard Tokenizer in PySpark, need to apply it
+ # explicitly
+ ds = tokenize_dataset(ds, tokenizer)
+
+ # model
+ hashingTF = HashingTF(inputCol='words', outputCol='features')
+ lr = LogisticRegression()
+ if tokenizer is None:
+ std_tokenizer = Tokenizer(inputCol='text', outputCol='words')
+ pipeline = Pipeline(stages=[std_tokenizer, hashingTF, lr])
+ else:
+ pipeline = Pipeline(stages=[hashingTF, lr])
+
+ logger.info('Start training...')
+ model = pipeline.fit(ds)
+
+ logger.info('Trained model saving...')
+ model.write().overwrite().save(output_model_path)
+
+
+def eval(model_path, ds, tokenizer=None):
+ logger.info('Preparing to evaluate...')
+ if tokenizer:
+ # tokenizer is not standard Tokenizer in PySpark, need to apply it
+ # explicitly
+ ds = tokenize_dataset(ds, tokenizer)
+
+ logger.info('Start evaluation...')
+ model = prepare_model(model_path)
+ pred = model.transform(ds)
+ P = pred.filter('label = 1')
+ N = pred.filter('label = 0')
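+    # add-one smoothing on the confusion-matrix counts so that none of
+    # TP / FP / TN / FN is zero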
+ TP = P.filter('prediction = 1').count() + 1
+ FP = N.filter('prediction = 1').count() + 1
+ TN = N.filter('prediction = 0').count() + 1
+ FN = P.filter('prediction = 0').count() + 1
+ precision = 1.0 * TP / (TP + FP)
+ recall = 1.0 * TP / P.count()
+ F1 = 2.0 * precision * recall / (precision + recall)
+ logger.info(f'TP: {TP}, FN: {FN}')
+ logger.info(f'FP: {FP}, TN: {TN}')
+ logger.info(f'P: {precision}, R: {recall}, F1: {F1}')
+
+
+def predict(model, ds, tokenizer=None, keep_method='label'):
+ logger.info('Start scoring dataset...')
+ if tokenizer:
+ # tokenizer is not standard Tokenizer in PySpark, need to apply it
+ # explicitly
+ ds = tokenize_dataset(ds, tokenizer)
+
+ prediction = model.transform(ds)
+
+ # A UDF to extract doc scores from probability vectors
+ def extract_prob(v):
+ try:
+ return float(v[1])
+ except ValueError:
+ return None
+
+ extract_prob_udf = udf(extract_prob, DoubleType())
+ doc_score = prediction.withColumn('doc_score',
+ extract_prob_udf(col('probability')))
+
+ # A UDF to get the bool value indicating whether this sample should be kept
+ should_keep_label_udf = get_keep_method_udf(keep_method)
+ should_keep = doc_score.withColumn('should_keep',
+ should_keep_label_udf(col('doc_score')))
+ return should_keep.drop('words', 'features', 'rawPrediction',
+ 'probability', 'prediction')
diff --git a/demos/tool_quality_classifier/quality_classifier/train.py b/demos/tool_quality_classifier/quality_classifier/train.py
new file mode 100644
index 000000000..ea4459c69
--- /dev/null
+++ b/demos/tool_quality_classifier/quality_classifier/train.py
@@ -0,0 +1,113 @@
+# This tool is used for training a quality classifier for your own datasets
+# based on PySpark.
+#
+# After training, this tool will generate a classifier model in a specific
+# directory. You can use it to evaluate or predict on other datasets using eval
+# and predict tools.
+#
+# This tool needs several arguments:
+# - positive_datasets: the paths to the positive datasets. It could be a
+# string for a single dataset, e.g. 'pos.parquet', or a list of strings
+# for several datasets, e.g. '["pos1.parquet", "pos2.parquet"]'.
+# - negative_datasets: the paths to the negative datasets. It could be a
+# string for a single dataset, e.g. 'neg.parquet', or a list of strings
+# for several datasets, e.g. '["neg1.parquet", "neg2.parquet"]'.
+# - output_model_path: the path to store the trained quality classifier. It's
+# "my_quality_model" in default.
+# - num_training_samples: number of samples used to train the model. It's 0
+# in default, which means using all samples in datasets to train.
+# - train_test_split_ratio: ratio to split train and test set. It's 0.8 in
+# default.
+# - tokenizer: what tokenizer to use to tokenize texts. It's None in default,
+# which means using the standard Tokenizer of PySpark. You can use one of
+# ["zh.sp.model", "code.sp.model"] we provided, or you can set it to the
+# path to your own sentencepiece model.
+# - evaluation: whether to evaluate the model after training using test set.
+# It's True in default.
+# - text_key: the field key name to hold texts to be classified. It's "text"
+# in default.
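+#
+# A hypothetical invocation (the dataset paths and sample count below are
+# placeholders, not files shipped with this tool):
+#
+#   python train.py \
+#     --positive_datasets '["pos.parquet"]' \
+#     --negative_datasets '["neg.parquet"]' \
+#     --output_model_path my_quality_model \
+#     --num_training_samples 10000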
+
+import fire
+from loguru import logger
+
+from qc_utils import eval, init_spark, load_datasets, shuffle, train
+
+
+@logger.catch
+def main(positive_datasets,
+ negative_datasets,
+ output_model_path='my_quality_model',
+ num_training_samples=0,
+ train_test_split_ratio=0.8,
+ tokenizer=None,
+ evaluation=True,
+ text_key='text'):
+ """
+ Train a quality classifier using your own pos/neg datasets.
+ :param positive_datasets: the paths to the positive datasets. It could be a
+ string for a single dataset, e.g. 'pos.parquet', or a list of strings
+ for several datasets, e.g. '["pos1.parquet", "pos2.parquet"]'.
+ :param negative_datasets: the paths to the negative datasets. It could be a
+ string for a single dataset, e.g. 'neg.parquet', or a list of strings
+ for several datasets, e.g. '["neg1.parquet", "neg2.parquet"]'.
+ :param output_model_path: the path to store the trained quality classifier.
+ It's "my_quality_model" in default.
+ :param num_training_samples: number of samples used to train the model.
+ It's 0 in default, which means using all samples in datasets to train.
+ :param train_test_split_ratio: ratio to split train and test set. It's 0.8
+ in default.
+ :param tokenizer: what tokenizer to use to tokenize texts. It's None in
+ default, which means using the standard Tokenizer of PySpark. You can
+ use one of ["zh.sp.model", "code.sp.model"] we provided, or you can set
+ it to the path to your own sentencepiece model.
+ :param evaluation: whether to evaluate the model after training using test
+ set. It's True in default.
+ :param text_key: the field key name to hold texts to be classified. It's
+ "text" in default.
+ :return:
+ """
+ # convert a single dataset to a dataset list
+ if isinstance(positive_datasets, str):
+ positive_datasets = [positive_datasets]
+ if isinstance(negative_datasets, str):
+ negative_datasets = [negative_datasets]
+
+ spark = init_spark()
+
+ pos = load_datasets(spark,
+ positive_datasets,
+ text_key=text_key,
+ label=1,
+ only_text=True)
+ neg = load_datasets(spark,
+ negative_datasets,
+ text_key=text_key,
+ label=0,
+ only_text=True)
+
+ if pos is None or neg is None:
+ logger.error('Empty dataset in positive/negative dataset list...')
+ exit(1)
+
+ if num_training_samples > 0:
+        logger.info(
+            f'Only use {num_training_samples} pairs of samples to train.')
+ pos = shuffle(pos).limit(num_training_samples)
+ neg = shuffle(neg).limit(num_training_samples)
+
+ # merge pos and neg samples
+ ds = pos.unionAll(neg)
+ train_set, test_set = ds.randomSplit(
+ [train_test_split_ratio, 1.0 - train_test_split_ratio], seed=42)
+
+ logger.info(f'Number of training samples: {train_set.count()}, '
+ f'test samples: {test_set.count()}')
+
+ # ML pipeline
+ train(output_model_path, train_set, tokenizer)
+
+ if evaluation:
+ eval(output_model_path, test_set, tokenizer)
+
+
+if __name__ == '__main__':
+ fire.Fire(main)
diff --git a/docs/.DS_Store b/docs/.DS_Store
new file mode 100644
index 000000000..f9a28fc79
Binary files /dev/null and b/docs/.DS_Store differ
diff --git a/docs/DeveloperGuide.md b/docs/DeveloperGuide.md
new file mode 100644
index 000000000..d00441133
--- /dev/null
+++ b/docs/DeveloperGuide.md
@@ -0,0 +1,205 @@
+# How-to Guide for Developers
+
+* [How-to Guide for Developers](#how-to-guide-for-developers)
+ * [Coding Style](#coding-style)
+ * [Build your own ops](#build-your-own-ops)
+ * [Build your own configs](#build-your-own-configs)
+ * [Fruitful config sources & Type hints](#fruitful-config-sources--type-hints)
+ * [Hierarchical configs and helps](#hierarchical-configs-and-helps)
+
+## Coding Style
+
+We define our styles in `.pre-commit-config.yaml`. Before committing,
+please install `pre-commit` tool to check and modify accordingly:
+
+```shell
+# ===========install pre-commit tool===========
+pip install pre-commit
+
+cd <path_to_data_juicer>
+# install pre-commit script for data_juicer
+pre-commit install
+
+
+# ===========check all files===========
+git add .
+pre-commit run --all-files
+
+# commit after all checks have passed
+git commit -m "xxxx"
+```
+
+## Build your own ops
+
+- Data-Juicer allows everybody to build their own ops.
+- Before implementing a new op, please refer to [Operators](Operators.md) to avoid unnecessary duplication.
+- Assuming we want to add a new Filter operator called "TextLengthFilter" to keep only samples whose text length is within an expected range, we can follow these steps to build it.
+
+1. Create a new op file `text_length_filter.py` in the corresponding `data_juicer/ops/filter/` directory as follows.
+    - Because it's a Filter op, the new op needs to inherit from the basic `Filter` class in `base_op.py` and be decorated with `OPERATORS` to register itself automatically.
+
+```python
+import sys
+
+from jsonargparse.typing import PositiveInt
+
+from ..base_op import OPERATORS, Filter
+
+
+@OPERATORS.register_module('text_length_filter')
+class TextLengthFilter(Filter):
+ """
+    Filter to keep samples with total text length within a specific range.
+ """
+
+ def __init__(
+ self,
+ min_len: PositiveInt = 10,
+ max_len: PositiveInt = sys.maxsize,
+ *args,
+ **kwargs
+ ):
+ """
+ Initialization method.
+ :param min_len: The min text length in the filtering.
+ :param max_len: The max text length in the filtering.
+ """
+ super().__init__(*args, **kwargs)
+ self.min_len = min_len
+ self.max_len = max_len
+
+ def compute_stats(self, sample):
+ # check if it's computed already
+ if 'text_len' in sample['stats']:
+ return sample
+
+ sample['stats']['text_len'] = len(sample['text'])
+ return sample
+
+ def process(self, sample):
+ if self.min_len <= sample['stats']['text_len'] <= self.max_len:
+ return True
+ else:
+ return False
+```
+
+2. After implementation, add it to the op dictionary in the `__init__.py` file in the `data_juicer/ops/filter/` directory.
+
+```python
+from . import (..., # other ops
+ text_length_filter) # import this new op module
+```
+
+3. Now you can use this new op with custom arguments in your own config files!
+
+```yaml
+# other configs
+...
+
+# process configs
+process:
+ - text_length_filter: # add this op to your process list and set the parameters
+ min_len: 10
+ max_len: 1000
+```
+
+4. (Strongly Recommended) It's better to add corresponding tests for your own ops. For the `TextLengthFilter` above, add a `test_text_length_filter.py` file to the `tests/ops/filter/` directory as below.
+
+```python
+import unittest
+from data_juicer.ops.filter.text_length_filter import TextLengthFilter
+
+class TextLengthFilterTest(unittest.TestCase):
+
+ def test_func1(self):
+ pass
+
+ def test_func2(self):
+ pass
+
+ def test_func3(self):
+ pass
+```
+
+## Build your own configs
+- We provide easy configuration based on [jsonargparse](https://github.com/omni-us/jsonargparse/) to reduce the cost of boilerplate code.
+
+### Fruitful config sources & Type hints
+- A global config object can be initialized via
+```python
+# core.executor.py
+self.cfg = init_configs()
+```
+- in which function arguments from diverse sources can be specified and
+mixed, including
+1. *hard-coded default values* when registering the config into parser or specified in the classes' `__init__` functions
+2. default *config files* in json (yaml or jsonnet supersets)
+3. *environment variables*
+4. *POSIX-style command line arguments*, such as ``--project_name
+ my_data_demo`` or ``--project_name=my_data_demo`` , including config files
+
+- The final parsed values are merged from these sources, and the override
+priority follows the numbered order above.
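+- For example, a value from a default config file can be overridden on the
+command line. A minimal sketch (the config path and values are placeholders;
+the dotted flag names follow the help output shown below):
+
+```shell
+python tools/process_data.py \
+    --config configs/demo.yaml \
+    --project_name my_data_demo \
+    --text_length_filter.min 20
+```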
+
+Besides, many argument types and the corresponding validation are supported,
+including Python built-in types, types from the
+[Lib/typing](https://docs.python.org/3/library/typing.html) module, and
+extended [types](https://jsonargparse.readthedocs.io/en/stable/#type-hints)
+from jsonargparse, such as `restricted types` and `Paths` with customized
+limitations.
+
+### Hierarchical configs and helps
+- You can freely use dot notation in argument names to define the
+hierarchy, e.g., `maximum_line_length_filter.min`.
+More importantly, by default, we automatically register the configs from
+the docstrings of the implemented operators, so the structure of all
+configs is always in sync with the code.
+
+- You can get the hierarchical help information by running a script that calls
+our executor such as
+```
+$ python tools/process_data.py --help
+
+usage: process_data.py [-h] [--config CONFIG] [--print_config[=flags]] [--project_name PROJECT_NAME] [--dataset_path DATASET_PATH] [--dataset_dir DATASET_DIR] [--export_path EXPORT_PATH] [--process PROCESS]
+ [--np NP] [--text_key TEXT_KEY] [--document_deduplicator CONFIG] [--document_deduplicator.hash_method HASH_METHOD] [--document_deduplicator.lowercase LOWERCASE]
+ [--document_deduplicator.ignore_non_character IGNORE_NON_CHARACTER] [--language_id_score_filter CONFIG] [--language_id_score_filter.lang LANG] [--words_num_filter CONFIG] [--words_num_filter.min MIN] [--words_num_filter.max MAX]
+ [--alphanumeric_filter CONFIG] [--alphanumeric_filter.min MIN] [--alphanumeric_filter.max MAX] [--average_line_length_filter CONFIG] [--average_line_length_filter.min MIN] [--average_line_length_filter.max MAX]
+ [--maximum_line_length_filter CONFIG] [--maximum_line_length_filter.min MIN] [--maximum_line_length_filter.max MAX] [--text_length_filter CONFIG] [--text_length_filter.min MIN] [--text_length_filter.max MAX]
+ [--remove_comments_mapper CONFIG] [--remove_comments_mapper.type TYPE] [--remove_comments_mapper.inline INLINE] [--remove_comments_mapper.multiline MULTILINE] [--remove_header_mapper CONFIG]
+ [--remove_header_mapper.before_section BEFORE_SECTION]
+
+optional arguments:
+ -h, --help Show this help message and exit.
+ --config CONFIG Path to a configuration file.
+ --print_config[=flags]
+ Print the configuration after applying all other arguments and exit. The optional flags customizes the output and are one or more keywords separated by comma. The supported flags are: comments, skip_default, skip_null.
+ --project_name PROJECT_NAME
+ name of your data process project. (type: str, default: null)
+ --dataset_path DATASET_PATH
+ path to your dataset file, relative with respect to the config file’s location (type: Path_fr, default: null)
+ --dataset_dir DATASET_DIR
+ path to your dataset(s) within a directory, relative with respect to the config file’s location (type: Path_drw, default: null)
+ --export_path EXPORT_PATH
+ path to the output processed dataset, relative with respect to the config file’s location (type: Path_fc, default: null)
+ --process PROCESS, --process+ PROCESS
+ a list of several process operators with their arguments (type: List[Dict], default: null)
+ --np NP number of subprocess to process your dataset. (type: PositiveInt, default: null)
+ --text_key TEXT_KEY the key name of field that stores sample texts (type: Optional[str], default: content)
+
+:
+ --alphanumeric_filter CONFIG
+ Path to a configuration file.
+ --alphanumeric_filter.min MIN
+ the min filter rate in alphanumeric op. (type: ClosedUnitInterval, default: 0.0)
+ --alphanumeric_filter.max MAX
+ the max filter rate in alphanumeric op. (type: ClosedUnitInterval, default: 0.25)
+
+:
+ --text_length_filter CONFIG
+ Path to a configuration file.
+ --text_length_filter.min MIN
+ min text length in the filtering (type: int, default: 10)
+ --text_length_filter.max MAX
+ max text length in the filtering (type: int, default: 10000)
+
+......
+
+```
diff --git a/docs/DeveloperGuide_ZH.md b/docs/DeveloperGuide_ZH.md
new file mode 100644
index 000000000..080af587e
--- /dev/null
+++ b/docs/DeveloperGuide_ZH.md
@@ -0,0 +1,193 @@
+# How-to Guide for Developers
+
+[TOC]
+
+## Coding Style
+
+We define our coding style in `.pre-commit-config.yaml`. Before contributing code to the repository, please use the `pre-commit` tool to check and format your code accordingly.
+
+```shell
+# ===========install pre-commit tool===========
+pip install pre-commit
+
+cd <path_to_data_juicer>
+# install pre-commit script for data_juicer
+pre-commit install
+
+
+# ===========check all files===========
+git add .
+pre-commit run --all-files
+
+# commit after all checks have passed
+git commit -m "xxxx"
+```
+
+## Build your own ops
+
+- Data-Juicer allows everybody to build their own ops.
+- Before implementing a new op, please refer to [Operators](Operators_ZH.md) to avoid unnecessary duplication.
+- Assuming we want to add a new Filter operator called "TextLengthFilter" to keep only samples whose text length is within an expected range, we can follow these steps to build it.
+
+1. Create a new op file `text_length_filter.py` in the corresponding `data_juicer/ops/filter/` directory as follows.
+    - Because it's a Filter op, the new op needs to inherit from the basic `Filter` class in `base_op.py` and be decorated with `OPERATORS` to register itself automatically.
+
+```python
+import sys
+
+from jsonargparse.typing import PositiveInt
+
+from ..base_op import OPERATORS, Filter
+
+
+@OPERATORS.register_module('text_length_filter')
+class TextLengthFilter(Filter):
+ """
+    Filter to keep samples with total text length within a specific range.
+ """
+
+    def __init__(
+        self,
+        min_len: PositiveInt = 10,
+        max_len: PositiveInt = sys.maxsize,
+        *args,
+        **kwargs
+    ):
+        """
+        Initialization method.
+        :param min_len: The min text length in the filtering.
+        :param max_len: The max text length in the filtering.
+        """
+        super().__init__(*args, **kwargs)
+        self.min_len = min_len
+        self.max_len = max_len
+
+ def compute_stats(self, sample):
+ # check if it's computed already
+ if 'text_len' in sample['stats']:
+ return sample
+
+ sample['stats']['text_len'] = len(sample['text'])
+ return sample
+
+ def process(self, sample):
+ if self.min_len <= sample['stats']['text_len'] <= self.max_len:
+ return True
+ else:
+ return False
+```
+
+2. After implementation, add it to the op dictionary in the `__init__.py` file in the `data_juicer/ops/filter/` directory:
+
+```python
+from . import (..., # other ops
+ text_length_filter) # import this new op module
+
+```
+
+3. All done! Now you can use the newly added op with custom arguments in your own config files:
+
+```yaml
+# other configs
+...
+
+# process configs
+process:
+ - text_length_filter: # add this op to your process list and set the parameters
+ min_len: 10
+ max_len: 1000
+```
+
+4. (强烈推荐)为新添加的算子编写单元测试。对于上面的 `TextLengthFilter` 算子,建议在 `tests/ops/filter/` 中实现名为 `test_text_length_filter.py` 的测试文件,骨架如下(具体写法可参考骨架之后的示意):
+
+```python
+import unittest
+from data_juicer.ops.filter.text_length_filter import TextLengthFilter
+
+class TextLengthFilterTest(unittest.TestCase):
+
+ def test_func1(self):
+ pass
+
+ def test_func2(self):
+ pass
+
+ def test_func3(self):
+ pass
+```
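+
+其中每个测试函数可以参考如下方式构造样本并进行断言(以下仅为示意,样本内容与阈值均为假设,并沿用上文示例中直接读写 `sample['stats']` 的约定):
+
+```python
+import unittest
+
+from data_juicer.ops.filter.text_length_filter import TextLengthFilter
+
+
+class TextLengthFilterTest(unittest.TestCase):
+
+    def test_keep_and_filter(self):
+        # 仅为示意:构造两个样本,分别验证“被过滤”与“被保留”两种结果
+        op = TextLengthFilter(min_len=10, max_len=50)
+        short_sample = {'text': 'too short', 'stats': {}}
+        normal_sample = {'text': 'This sample is long enough to be kept.', 'stats': {}}
+
+        short_sample = op.compute_stats(short_sample)
+        normal_sample = op.compute_stats(normal_sample)
+
+        self.assertFalse(op.process(short_sample))  # 长度 9,小于 min_len
+        self.assertTrue(op.process(normal_sample))  # 长度 38,位于 [10, 50] 内
+
+
+if __name__ == '__main__':
+    unittest.main()
+```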
+
+## 构建自己的配置
+
+- 我们提供基于 [jsonargparse](https://github.com/omni-us/jsonargparse/) 的简单配置以降低样板代码的成本。
+
+### 丰富的配置源和类型提示
+
+- 全局配置对象可以通过以下方式初始化
+
+```python
+# core.executor.py
+self.cfg = init_configs()
+```
+
+- 其中可以指定和混合来自不同来源的函数参数,包括
+1. *硬编码默认值*:在把配置注册到解析器时,或在类的 `__init__` 函数中指定
+2. json 格式的默认*配置文件*(也支持其超集 yaml 或 jsonnet)
+3. *环境变量*
+4. *POSIX-style 命令行参数*, 例如 `--project_name my_data_demo` 或 `--project_name=my_data_demo`,包含配置文件
+
+- 最终解析得到的配置值是上述来源的混合结果,覆盖顺序与上面的编号一致:编号越大的来源优先级越高。
+
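+下面给出一个最小示意,展示命令行参数覆盖配置文件中同名配置的效果(配置文件路径为假设;`init_configs` 的用法与本仓库 `tests/config/test_config_funcs.py` 中一致):
+
+```python
+# 仅为示意:命令行参数覆盖配置文件中的同名配置项
+from data_juicer.config import init_configs
+
+cfg = init_configs(args=[
+    '--config', 'configs/demo.yaml',          # 假设的配置文件路径
+    '--language_id_score_filter.lang', 'en',  # 覆盖配置文件中该算子的 lang 取值
+])
+print(cfg.process)  # process 列表中 language_id_score_filter 的 lang 应为 'en'
+```
+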
+此外,还支持许多参数类型和相应的验证。
+包括 Python 内置类型、来自 [Lib/typing](https://docs.python.org/3/library/typing.html) 的类型,以及来自 jsonargparse 的[扩展类型](https://jsonargparse.readthedocs.io/en/stable/#type-hints),例如带自定义取值限制的 `restricted types` 和 `Paths`。
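+
+下面是一个最小示意,展示这类类型注解带来的参数校验效果(其中的解析器与参数名仅为演示用途,并非 Data-Juicer 的实际注册代码):
+
+```python
+# 仅为示意:jsonargparse 会依据类型注解对参数取值进行校验
+from jsonargparse import ArgumentParser
+from jsonargparse.typing import ClosedUnitInterval, PositiveInt
+
+parser = ArgumentParser()
+parser.add_argument('--np', type=PositiveInt, default=4)
+parser.add_argument('--min_ratio', type=ClosedUnitInterval, default=0.25)
+
+cfg = parser.parse_args(['--np', '8', '--min_ratio', '0.5'])
+print(cfg.np, cfg.min_ratio)
+# 传入越界取值(如 --np 0 或 --min_ratio 1.5)会在解析阶段直接报错
+```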
+
+### 层次化配置与帮助信息
+
+- 您可以在参数名称中自由使用点号来定义层次结构,例如 `maximum_line_length_filter.min`。
+更重要的是,我们默认会自动注册已实现算子的 docstring,因此所有结构化配置始终与代码保持同步。
+- 您可以通过运行脚本来获取层次化的帮助信息,例如:
+
+```
+$ python tools/process_data.py --help
+
+usage: process_data.py [-h] [--config CONFIG] [--print_config[=flags]] [--project_name PROJECT_NAME] [--dataset_path DATASET_PATH] [--dataset_dir DATASET_DIR] [--export_path EXPORT_PATH] [--process PROCESS]
+ [--np NP] [--text_key TEXT_KEY] [--document_deduplicator CONFIG] [--document_deduplicator.hash_method HASH_METHOD] [--document_deduplicator.lowercase LOWERCASE]
+ [--document_deduplicator.ignore_non_character IGNORE_NON_CHARACTER] [--language_id_score_filter CONFIG] [--language_id_score_filter.lang LANG] [--words_num_filter CONFIG] [--words_num_filter.min MIN] [--words_num_filter.max MAX]
+ [--alphanumeric_filter CONFIG] [--alphanumeric_filter.min MIN] [--alphanumeric_filter.max MAX] [--average_line_length_filter CONFIG] [--average_line_length_filter.min MIN] [--average_line_length_filter.max MAX]
+ [--maximum_line_length_filter CONFIG] [--maximum_line_length_filter.min MIN] [--maximum_line_length_filter.max MAX] [--text_length_filter CONFIG] [--text_length_filter.min MIN] [--text_length_filter.max MAX]
+ [--remove_comments_mapper CONFIG] [--remove_comments_mapper.type TYPE] [--remove_comments_mapper.inline INLINE] [--remove_comments_mapper.multiline MULTILINE] [--remove_header_mapper CONFIG]
+ [--remove_header_mapper.before_section BEFORE_SECTION]
+
+optional arguments:
+ -h, --help Show this help message and exit.
+ --config CONFIG Path to a configuration file.
+ --print_config[=flags]
+ Print the configuration after applying all other arguments and exit. The optional flags customizes the output and are one or more keywords separated by comma. The supported flags are: comments, skip_default, skip_null.
+ --project_name PROJECT_NAME
+ name of your data process project. (type: str, default: null)
+ --dataset_path DATASET_PATH
+ path to your dataset file, relative with respect to the config file’s location (type: Path_fr, default: null)
+ --dataset_dir DATASET_DIR
+ path to your dataset(s) within a directory, relative with respect to the config file’s location (type: Path_drw, default: null)
+ --export_path EXPORT_PATH
+ path to the output processed dataset, relative with respect to the config file’s location (type: Path_fc, default: null)
+ --process PROCESS, --process+ PROCESS
+ a list of several process operators with their arguments (type: List[Dict], default: null)
+ --np NP number of subprocess to process your dataset. (type: PositiveInt, default: null)
+ --text_key TEXT_KEY the key name of field that stores sample texts (type: Optional[str], default: content)
+
+:
+ --alphanumeric_filter CONFIG
+ Path to a configuration file.
+ --alphanumeric_filter.min MIN
+ the min filter rate in alphanumeric op. (type: ClosedUnitInterval, default: 0.0)
+ --alphanumeric_filter.max MAX
+ the max filter rate in alphanumeric op. (type: ClosedUnitInterval, default: 0.25)
+
+:
+ --text_length_filter CONFIG
+ Path to a configuration file.
+ --text_length_filter.min MIN
+ min text length in the filtering (type: int, default: 10)
+ --text_length_filter.max MAX
+ max text length in the filtering (type: int, default: 10000)
+
+......
+
+```
diff --git a/docs/Operators.md b/docs/Operators.md
new file mode 100644
index 000000000..df181eb2c
--- /dev/null
+++ b/docs/Operators.md
@@ -0,0 +1,105 @@
+# Operator Schemas
+
+Operators are a collection of basic processes that assist in data modification, cleaning, filtering, deduplication, etc. We support a wide range of data sources and file formats, and allow for flexible extension to custom datasets.
+
+
+## Overview
+
+The operators in Data-Juicer are categorized into 5 types.
+
+| Type | Number | Description |
+|-----------------------------------|:------:|-------------|
+| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
+| [ Mapper ]( #mapper ) | 17 | Edits and transforms samples |
+| [ Filter ]( #filter ) | 15 | Filters out low-quality samples |
+| [ Deduplicator ]( #deduplicator ) | 3 | Detects and removes duplicate samples |
+| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking |
+
+
+All the specific operators are listed below, each featured with several capability tags.
+
+* Domain Tags
+ - General: general purpose
+ - LaTeX: specific to LaTeX source files
+ - Code: specific to programming codes
+  - Financial: closely related to the financial sector
+* Language Tags
+ - en: English
+ - zh: Chinese
+
+
+## Formatter
+
+| Operator | Domain | Lang | Description |
+|-------------------|---------|--------|--------------------------------------------------------------------|
+| remote_formatter | General | en, zh | Prepares datasets from remote (e.g., HuggingFace) |
+| csv_formatter | General | en, zh | Prepares local `.csv` files |
+| tsv_formatter | General | en, zh | Prepares local `.tsv` files |
+| json_formatter | General | en, zh | Prepares local `.json`, `.jsonl`, `.jsonl.zst` files |
+| parquet_formatter | General | en, zh | Prepares local `.parquet` files |
+| text_formatter | General | en, zh | Prepares other local text files ([complete list](data_juicer/format/text_formatter.py#L46,56)) |
+| mixture_formatter | General | en, zh | Handles a mixture of all the supported local file types |
+
+
+## Mapper
+
+| Operator | Domain | Lang | Description |
+|-----------------------------------------------|--------------------|--------|----------------------------------------------------------------------------------------------------------------|
+| remove_header_mapper | LaTeX | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names |
+| remove_bibliography_mapper | LaTeX | en, zh | Removes the bibliography of TeX documents |
+| expand_macro_mapper | LaTeX | en, zh | Expands macros usually defined at the top of TeX documents |
+| whitespace_normalization_mapper | General | en, zh | Normalizes various Unicode whitespaces to the normal ASCII space (U+0020) |
+| punctuation_normalization_mapper | General | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents |
+| fix_unicode_mapper | General | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/)) |
+| sentence_split_mapper | General | en | Splits and reorganizes sentences according to semantics |
+| remove_long_words_mapper | General | en, zh | Removes words with length outside the specified range |
+| remove_words_with_incorrect_substrings_mapper | General | en, zh | Removes words containing specified substrings |
+| clean_email_mapper | General | en, zh | Removes email information |
+| clean_ip_mapper | General | en, zh | Removes IP addresses |
+| clean_links_mapper | General, Code | en, zh | Removes links, such as those starting with http or ftp |
+| clean_html_mapper | General | en, zh | Removes HTML tags and returns plain text of all the nodes |
+| remove_table_text_mapper | General, Financial | en | Detects and removes possible table contents (:warning: relies on regular expression matching and is thus fragile) |
+| clean_copyright_mapper | Code | en, zh | Removes copyright notice at the beginning of code files (:warning: must contain the word *copyright*) |
+| remove_specific_chars_mapper | General | en, zh | Removes any user-specified characters or substrings |
+
+
+## Filter
+
+| Operator | Domain | Lang | Description |
+|--------------------------------|---------|--------|--------------------------------------------------------------------------------------------|
+| word_num_filter | General | en, zh | Keeps samples with word count within the specified range |
+| stopwords_filter | General | en, zh | Keeps samples with stopword ratio above the specified threshold |
+| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold |
+| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range |
+| word_repetition_filter | General | en, zh | Keeps samples with word-level n-gram repetition ratio within the specified range |
+| special_characters_filter | General | en, zh | Keeps samples with special-char ratio within the specified range |
+| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score |
+| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold |
+| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range |
+| average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range |
+| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range |
+| text_length_filter | General | en, zh | Keeps samples with total text length within the specified range |
+| suffix_filter | General | en, zh | Keeps samples with specified suffixes |
+| specified_field_filter | General | en, zh | Keeps samples whose value of the specified field is within the specified target values |
+| specified_numeric_field_filter | General | en, zh | Keeps samples whose value of the specified numeric field is within the specified range |
+
+
+## Deduplicator
+
+| Operator | Domain | Lang | Description |
+|-------------------------------|---------|--------|-------------------------------------------------------------|
+| document_deduplicator | General | en, zh | Deduplicates samples at document level by comparing MD5 hashes |
+| document_minhash_deduplicator | General | en, zh | Deduplicates samples at document level using MinHashLSH |
+| document_simhash_deduplicator | General | en, zh | Deduplicates samples at document level using SimHash |
+
+
+## Selector
+
+| Operator | Domain | Lang | Description |
+|------------------------------------|---------|--------|-----------------------------------------------------------------------|
+| topk_specified_field_selector | General | en, zh | Selects top samples by comparing the values of the specified field |
+| frequency_specified_field_selector | General | en, zh | Selects top samples by comparing the frequency of the specified field |
+
+
+## Contributing
+We welcome contributions of adding new operators. Please refer to [How-to Guide for Developers](DeveloperGuide.md).
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
new file mode 100644
index 000000000..b76bc944d
--- /dev/null
+++ b/docs/Operators_ZH.md
@@ -0,0 +1,98 @@
+# 算子提要
+
+算子 (Operator) 是协助数据修改、清理、过滤、去重等基本流程的集合。我们支持广泛的数据来源和文件格式,并支持对自定义数据集的灵活扩展。
+
+## 概览
+
+Data-Juicer 中的算子分为以下 5 种类型。
+
+| 类型 | 数量 | 描述 |
+|-----------------------------------|:------:|-------------|
+| [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 |
+| [ Mapper ]( #mapper ) | 17 | 对数据样本进行编辑和转换 |
+| [ Filter ]( #filter ) | 15 | 过滤低质量样本 |
+| [ Deduplicator ]( #deduplicator ) | 3 | 识别、删除重复样本 |
+| [ Selector ]( #selector ) | 2 | 基于排序选取高质量样本 |
+
+下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。
+
+* Domain 标签
+ - General: 一般用途
+ - LaTeX: 专用于 LaTeX 源文件
+ - Code: 专用于编程代码
+ - Financial: 与金融领域相关
+* Language 标签
+ - en: 英文
+ - zh: 中文
+
+
+## Formatter
+
+| 算子 | 场景 | 语言 | 描述 |
+|-------------------|---------|--------|--------------------------------------------------------------------|
+| remote_formatter | General | en, zh | 准备远端数据集 (如 HuggingFace) |
+| csv_formatter | General | en, zh | 准备本地 `.csv` 文件 |
+| tsv_formatter | General | en, zh | 准备本地 `.tsv` 文件 |
+| json_formatter | General | en, zh | 准备本地 `.json`, `.jsonl`, `.jsonl.zst` 文件 |
+| parquet_formatter | General | en, zh | 准备本地 `.parquet` 文件 |
+| text_formatter | General | en, zh | 准备其他本地文本文件([完整的支持列表](data_juicer/format/text_formatter.py#L46,56)) |
+| mixture_formatter | General | en, zh | 处理所有支持的本地文件类型的混合数据 |
+
+## Mapper
+
+| 算子 | 场景 | 语言 | 描述 |
+|-----------------------------------------------|--------------------|--------|-------------------------------------------------------------------|
+| remove_header_mapper | LaTeX | en, zh | 删除 TeX 文档头,例如标题、章节数字/名称等 |
+| remove_bibliography_mapper | LaTeX | en, zh | 删除 TeX 文档的参考文献 |
+| expand_macro_mapper | LaTeX | en, zh | 扩展通常在 TeX 文档顶部定义的宏 |
+| whitespace_normalization_mapper | General | en, zh | 将各种 Unicode 空白标准化为常规 ASCII 空格 (U+0020) |
+| punctuation_normalization_mapper | General | en, zh | 将各种 Unicode 标点符号标准化为其 ASCII 等效项 |
+| fix_unicode_mapper | General | en, zh | 修复损坏的 Unicode(借助 [ftfy](https://ftfy.readthedocs.io/)) |
+| sentence_split_mapper | General | en | 根据语义拆分和重组句子 |
+| remove_long_words_mapper | General | en, zh | 删除长度超出指定范围的单词 |
+| remove_words_with_incorrect_substrings_mapper | General | en, zh | 删除包含指定子字符串的单词 |
+| clean_email_mapper | General | en, zh | 删除邮箱信息 |
+| clean_ip_mapper | General | en, zh | 删除 IP 地址 |
+| clean_links_mapper | General, Code | en, zh | 删除链接,例如以 http 或 ftp 开头的 |
+| clean_html_mapper | General | en, zh | 删除 HTML 标签并返回所有节点的纯文本 |
+| remove_table_text_mapper | General, Financial | en | 检测并删除可能的表格内容(:warning: 依赖正则表达式匹配,因此很脆弱) |
+| clean_copyright_mapper | Code | en, zh | 删除代码文件开头的版权声明 (:warning: 必须包含单词 *copyright*) |
+| remove_specific_chars_mapper | General | en, zh | 删除任何用户指定的字符或子字符串 |
+
+## Filter
+
+| 算子 | 场景 | 语言 | 描述 |
+|--------------------------------|---------|--------|---------------------------------------------------------|
+| word_num_filter | General | en, zh | 保留字数在指定范围内的样本 |
+| stopwords_filter | General | en, zh | 保留停用词比率高于指定阈值的样本 |
+| flagged_words_filter | General | en, zh | 保留标记词比率低于指定阈值的样本 |
+| character_repetition_filter | General | en, zh | 保留 char-level n-gram 重复比率在指定范围内的样本 |
+| word_repetition_filter | General | en, zh | 保留 word-level n-gram 重复比率在指定范围内的样本 |
+| special_characters_filter | General | en, zh | 保留特殊字符比率在指定范围内的样本 |
+| language_id_score_filter | General | en, zh | 保留特定语言的样本,通过预测的置信度得分来判断 |
+| perplexity_filter | General | en, zh | 保留困惑度低于指定阈值的样本 |
+| maximum_line_length_filter | Code | en, zh | 保留最大行长度在指定范围内的样本 |
+| average_line_length_filter | Code | en, zh | 保留平均行长度在指定范围内的样本 |
+| alphanumeric_filter | General | en, zh | 保留字母数字比例在指定范围内的样本 |
+| text_length_filter | General | en, zh | 保留总文本长度在指定范围内的样本 |
+| suffix_filter | General | en, zh | 保留包含特定后缀的样本 |
+| specified_field_filter | General | en, zh | 根据字段过滤样本,要求字段的值处于指定目标中 |
+| specified_numeric_field_filter | General | en, zh | 根据字段过滤样本,要求字段的值处于指定范围(针对数字类型) |
+
+## Deduplicator
+
+| 算子 | 场景 | 语言 | 描述 |
+|-------------------------------|---------|--------|----------------------------------------------|
+| document_deduplicator | General | en, zh | 通过比较 MD5 哈希值在文档级别对样本去重 |
+| document_minhash_deduplicator | General | en, zh | 使用 MinHashLSH 在文档级别对样本去重 |
+| document_simhash_deduplicator | General | en, zh | 使用 SimHash 在文档级别对样本去重 |
+
+## Selector
+
+| 算子 | 场景 | 语言 | 描述 |
+|------------------------------------|---------|--------|-----------------------------------------------|
+| topk_specified_field_selector | General | en, zh | 通过比较指定字段的值选出前 k 个样本 |
+| frequency_specified_field_selector | General | en, zh | 通过比较指定字段的频率选出前 k 个样本 |
+
+## 贡献
+我们欢迎社区贡献新的算子,具体请参考[开发者指南](DeveloperGuide_ZH.md)。
diff --git a/docs/config_def.png b/docs/config_def.png
new file mode 100644
index 000000000..2ff77ef65
Binary files /dev/null and b/docs/config_def.png differ
diff --git a/docs/imgs/data-juicer.png b/docs/imgs/data-juicer.png
new file mode 100644
index 000000000..ffa59a3db
Binary files /dev/null and b/docs/imgs/data-juicer.png differ
diff --git a/docs/imgs/eval-01.png b/docs/imgs/eval-01.png
new file mode 100644
index 000000000..382bf2743
Binary files /dev/null and b/docs/imgs/eval-01.png differ
diff --git a/docs/imgs/eval-02.png b/docs/imgs/eval-02.png
new file mode 100644
index 000000000..71ca49bb2
Binary files /dev/null and b/docs/imgs/eval-02.png differ
diff --git a/docs/sphinx_doc/Makefile b/docs/sphinx_doc/Makefile
new file mode 100644
index 000000000..d0c3cbf10
--- /dev/null
+++ b/docs/sphinx_doc/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/sphinx_doc/README.md b/docs/sphinx_doc/README.md
new file mode 100644
index 000000000..eb4372201
--- /dev/null
+++ b/docs/sphinx_doc/README.md
@@ -0,0 +1,32 @@
+# Data-Juicer Documentation
+
+We build our API documentation with the help of Sphinx.
+To update the generated docs, please run the following commands:
+
+```bash
+# $~/data_juicer/docs/sphinx_doc
+# 1. install the sphinx requirements and init the sphinx-quickstart
+pip install sphinx sphinx-autobuild sphinx_rtd_theme recommonmark
+# or pip install -r ../../environments/dev_requires.txt
+sphinx-quickstart
+
+# 2. auto generate the doc files for all sub modules (*.rst) from source codes
+sphinx-apidoc -o source ../../data_juicer
+
+# 3. modify the auto-generated files according to your requirements
+vim source/modules.rst
+
+# 4. finalize the doc, which is stored in the `build/html` directory
+make clean
+make html
+mv build/html position_to_publish
+```
+
+- For convenience (so you don’t have to compile from scratch again), the built
+  directory (including the html files) can be downloaded as follows:
+```bash
+# cd docs/sphinx_doc
+wget https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/sphinx_API_build_0801.zip
+unzip sphinx_API_build_0801.zip
+```
diff --git a/docs/sphinx_doc/README_ZH.md b/docs/sphinx_doc/README_ZH.md
new file mode 100644
index 000000000..4f57ea167
--- /dev/null
+++ b/docs/sphinx_doc/README_ZH.md
@@ -0,0 +1,31 @@
+# Data-Juicer 文档
+
+Data-Juicer 借助 Sphinx 构建 API 文档。
+如需更新生成的文档,请运行以下命令:
+
+```bash
+# $~/data_juicer/docs/sphinx_doc
+# 1.安装 sphinx 的依赖并初始化 sphinx-quickstart
+pip install sphinx sphinx-autobuild sphinx_rtd_theme recommonmark
+# or pip install -r ../../environments/dev_requires.txt
+sphinx-quickstart
+
+# 2. 从源代码自动生成所有子模块(*.rst)的文档文件
+sphinx-apidoc -o source ../../data_juicer
+
+# 3. 根据您的要求修改自动生成的文件
+vim source/modules.rst
+
+# 4. 完成文档的构建,文档存储目录为 `build/html`
+make clean
+make html
+mv build/html position_to_publish
+```
+
+- 为了方便起见(不必再次从头开始编译),可以按如下方式下载构建的目录(包括 html 文件):
+
+```bash
+# cd docs/sphinx_doc
+wget https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/sphinx_API_build_0801.zip
+unzip sphinx_API_build_0801.zip
+```
diff --git a/docs/sphinx_doc/make.bat b/docs/sphinx_doc/make.bat
new file mode 100644
index 000000000..dc1312ab0
--- /dev/null
+++ b/docs/sphinx_doc/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.https://www.sphinx-doc.org/
+ exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/sphinx_doc/source/conf.py b/docs/sphinx_doc/source/conf.py
new file mode 100644
index 000000000..37aee1f02
--- /dev/null
+++ b/docs/sphinx_doc/source/conf.py
@@ -0,0 +1,42 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = 'data_juicer'
+copyright = '2023, Data-Juicer Team'
+author = 'Data-Juicer Team'
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+import sphinx_rtd_theme
+
+from data_juicer import __version__ as version
+
+release = version
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.napoleon',
+]
+
+templates_path = ['_templates']
+exclude_patterns = ['build']
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = 'sphinx_rtd_theme'
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
diff --git a/docs/sphinx_doc/source/data_juicer.analysis.rst b/docs/sphinx_doc/source/data_juicer.analysis.rst
new file mode 100644
index 000000000..e8a6c97a7
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.analysis.rst
@@ -0,0 +1,37 @@
+data\_juicer.analysis package
+=============================
+
+Submodules
+----------
+
+data\_juicer.analysis.column\_wise\_analysis module
+---------------------------------------------------
+
+.. automodule:: data_juicer.analysis.column_wise_analysis
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.analysis.diversity\_analysis module
+------------------------------------------------
+
+.. automodule:: data_juicer.analysis.diversity_analysis
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.analysis.overall\_analysis module
+----------------------------------------------
+
+.. automodule:: data_juicer.analysis.overall_analysis
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.analysis
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.config.rst b/docs/sphinx_doc/source/data_juicer.config.rst
new file mode 100644
index 000000000..9b7293596
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.config.rst
@@ -0,0 +1,21 @@
+data\_juicer.config package
+===========================
+
+Submodules
+----------
+
+data\_juicer.config.config module
+---------------------------------
+
+.. automodule:: data_juicer.config.config
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.config
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.core.rst b/docs/sphinx_doc/source/data_juicer.core.rst
new file mode 100644
index 000000000..858d271ca
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.core.rst
@@ -0,0 +1,53 @@
+data\_juicer.core package
+=========================
+
+Submodules
+----------
+
+data\_juicer.core.analyser module
+---------------------------------
+
+.. automodule:: data_juicer.core.analyser
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.core.data module
+-----------------------------
+
+.. automodule:: data_juicer.core.data
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.core.executor module
+---------------------------------
+
+.. automodule:: data_juicer.core.executor
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.core.exporter module
+---------------------------------
+
+.. automodule:: data_juicer.core.exporter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.core.tracer module
+-------------------------------
+
+.. automodule:: data_juicer.core.tracer
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.core
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.format.rst b/docs/sphinx_doc/source/data_juicer.format.rst
new file mode 100644
index 000000000..575a5b16a
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.format.rst
@@ -0,0 +1,77 @@
+data\_juicer.format package
+===========================
+
+Submodules
+----------
+
+data\_juicer.format.csv\_formatter module
+-----------------------------------------
+
+.. automodule:: data_juicer.format.csv_formatter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.format.formatter module
+------------------------------------
+
+.. automodule:: data_juicer.format.formatter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.format.json\_formatter module
+------------------------------------------
+
+.. automodule:: data_juicer.format.json_formatter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.format.load module
+-------------------------------
+
+.. automodule:: data_juicer.format.load
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.format.mixture\_formatter module
+---------------------------------------------
+
+.. automodule:: data_juicer.format.mixture_formatter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.format.parquet\_formatter module
+---------------------------------------------
+
+.. automodule:: data_juicer.format.parquet_formatter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.format.text\_formatter module
+------------------------------------------
+
+.. automodule:: data_juicer.format.text_formatter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.format.tsv\_formatter module
+-----------------------------------------
+
+.. automodule:: data_juicer.format.tsv_formatter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.format
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.ops.common.rst b/docs/sphinx_doc/source/data_juicer.ops.common.rst
new file mode 100644
index 000000000..be34ff5bf
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.ops.common.rst
@@ -0,0 +1,29 @@
+data\_juicer.ops.common package
+===============================
+
+Submodules
+----------
+
+data\_juicer.ops.common.helper\_func module
+-------------------------------------------
+
+.. automodule:: data_juicer.ops.common.helper_func
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.common.special\_characters module
+--------------------------------------------------
+
+.. automodule:: data_juicer.ops.common.special_characters
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.ops.common
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.ops.deduplicator.rst b/docs/sphinx_doc/source/data_juicer.ops.deduplicator.rst
new file mode 100644
index 000000000..d30ce1dad
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.ops.deduplicator.rst
@@ -0,0 +1,37 @@
+data\_juicer.ops.deduplicator package
+=====================================
+
+Submodules
+----------
+
+data\_juicer.ops.deduplicator.document\_deduplicator module
+-----------------------------------------------------------
+
+.. automodule:: data_juicer.ops.deduplicator.document_deduplicator
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.deduplicator.document\_minhash\_deduplicator module
+--------------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.deduplicator.document_minhash_deduplicator
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.deduplicator.document\_simhash\_deduplicator module
+--------------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.deduplicator.document_simhash_deduplicator
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.ops.deduplicator
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.ops.filter.rst b/docs/sphinx_doc/source/data_juicer.ops.filter.rst
new file mode 100644
index 000000000..64e449177
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.ops.filter.rst
@@ -0,0 +1,133 @@
+data\_juicer.ops.filter package
+===============================
+
+Submodules
+----------
+
+data\_juicer.ops.filter.alphanumeric\_filter module
+---------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.alphanumeric_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.average\_line\_length\_filter module
+------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.average_line_length_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.character\_repetition\_filter module
+------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.character_repetition_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.flagged\_words\_filter module
+-----------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.flagged_words_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.language\_id\_score\_filter module
+----------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.language_id_score_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.maximum\_line\_length\_filter module
+------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.maximum_line_length_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.perplexity\_filter module
+-------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.perplexity_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.special\_characters\_filter module
+----------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.special_characters_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.specified\_field\_filter module
+-------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.specified_field_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.specified\_numeric\_field\_filter module
+----------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.specified_numeric_field_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.stopwords\_filter module
+------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.stopwords_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.suffix\_filter module
+---------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.suffix_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.text\_length\_filter module
+---------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.text_length_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.word\_num\_filter module
+------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.word_num_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.filter.word\_repetition\_filter module
+-------------------------------------------------------
+
+.. automodule:: data_juicer.ops.filter.word_repetition_filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.ops.filter
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.ops.mapper.rst b/docs/sphinx_doc/source/data_juicer.ops.mapper.rst
new file mode 100644
index 000000000..c8688614b
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.ops.mapper.rst
@@ -0,0 +1,149 @@
+data\_juicer.ops.mapper package
+===============================
+
+Submodules
+----------
+
+data\_juicer.ops.mapper.clean\_copyright\_mapper module
+-------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.clean_copyright_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.clean\_email\_mapper module
+---------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.clean_email_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.clean\_html\_mapper module
+--------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.clean_html_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.clean\_ip\_mapper module
+------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.clean_ip_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.clean\_links\_mapper module
+---------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.clean_links_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.expand\_macro\_mapper module
+----------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.expand_macro_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.fix\_unicode\_mapper module
+---------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.fix_unicode_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.punctuation\_normalization\_mapper module
+-----------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.punctuation_normalization_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.remove\_bibliography\_mapper module
+-----------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.remove_bibliography_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.remove\_comments\_mapper module
+-------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.remove_comments_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.remove\_header\_mapper module
+-----------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.remove_header_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.remove\_long\_words\_mapper module
+----------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.remove_long_words_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.remove\_specific\_chars\_mapper module
+--------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.remove_specific_chars_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.remove\_table\_text\_mapper module
+----------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.remove_table_text_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.remove\_words\_with\_incorrect\_substrings\_mapper module
+---------------------------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.sentence\_split\_mapper module
+------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.sentence_split_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.mapper.whitespace\_normalization\_mapper module
+----------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.mapper.whitespace_normalization_mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.ops.mapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.ops.rst b/docs/sphinx_doc/source/data_juicer.ops.rst
new file mode 100644
index 000000000..f25068b50
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.ops.rst
@@ -0,0 +1,41 @@
+data\_juicer.ops package
+========================
+
+Subpackages
+-----------
+
+.. toctree::
+ :maxdepth: 4
+
+ data_juicer.ops.common
+ data_juicer.ops.deduplicator
+ data_juicer.ops.filter
+ data_juicer.ops.mapper
+ data_juicer.ops.selector
+
+Submodules
+----------
+
+data\_juicer.ops.base\_op module
+--------------------------------
+
+.. automodule:: data_juicer.ops.base_op
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.load module
+----------------------------
+
+.. automodule:: data_juicer.ops.load
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.ops
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.ops.selector.rst b/docs/sphinx_doc/source/data_juicer.ops.selector.rst
new file mode 100644
index 000000000..266b47408
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.ops.selector.rst
@@ -0,0 +1,29 @@
+data\_juicer.ops.selector package
+=================================
+
+Submodules
+----------
+
+data\_juicer.ops.selector.frequency\_specified\_field\_selector module
+----------------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.selector.frequency_specified_field_selector
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.ops.selector.topk\_specified\_field\_selector module
+-----------------------------------------------------------------
+
+.. automodule:: data_juicer.ops.selector.topk_specified_field_selector
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.ops.selector
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.rst b/docs/sphinx_doc/source/data_juicer.rst
new file mode 100644
index 000000000..c305d1dd0
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.rst
@@ -0,0 +1,23 @@
+data\_juicer package
+====================
+
+Subpackages
+-----------
+
+.. toctree::
+ :maxdepth: 4
+
+ data_juicer.analysis
+ data_juicer.config
+ data_juicer.core
+ data_juicer.format
+ data_juicer.ops
+ data_juicer.utils
+
+Module contents
+---------------
+
+.. automodule:: data_juicer
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/data_juicer.utils.rst b/docs/sphinx_doc/source/data_juicer.utils.rst
new file mode 100644
index 000000000..65b8d1208
--- /dev/null
+++ b/docs/sphinx_doc/source/data_juicer.utils.rst
@@ -0,0 +1,69 @@
+data\_juicer.utils package
+==========================
+
+Submodules
+----------
+
+data\_juicer.utils.asset\_utils module
+--------------------------------------
+
+.. automodule:: data_juicer.utils.asset_utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.utils.cache\_utils module
+--------------------------------------
+
+.. automodule:: data_juicer.utils.cache_utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.utils.ckpt\_utils module
+-------------------------------------
+
+.. automodule:: data_juicer.utils.ckpt_utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.utils.file\_utils module
+-------------------------------------
+
+.. automodule:: data_juicer.utils.file_utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.utils.logger\_utils module
+---------------------------------------
+
+.. automodule:: data_juicer.utils.logger_utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.utils.model\_utils module
+--------------------------------------
+
+.. automodule:: data_juicer.utils.model_utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+data\_juicer.utils.registry module
+----------------------------------
+
+.. automodule:: data_juicer.utils.registry
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: data_juicer.utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/sphinx_doc/source/index.rst b/docs/sphinx_doc/source/index.rst
new file mode 100644
index 000000000..9c098d834
--- /dev/null
+++ b/docs/sphinx_doc/source/index.rst
@@ -0,0 +1,21 @@
+.. data-juicer documentation master file, created by
+ sphinx-quickstart on Mon May 22 16:16:12 2023.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to data-juicer's documentation!
+=======================================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: References:
+
+.. include:: modules.rst
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/sphinx_doc/source/modules.rst b/docs/sphinx_doc/source/modules.rst
new file mode 100644
index 000000000..2845759f3
--- /dev/null
+++ b/docs/sphinx_doc/source/modules.rst
@@ -0,0 +1,7 @@
+data_juicer
+===========
+
+.. toctree::
+ :maxdepth: 4
+
+ data_juicer
diff --git a/environments/dev_requires.txt b/environments/dev_requires.txt
new file mode 100644
index 000000000..ff091a304
--- /dev/null
+++ b/environments/dev_requires.txt
@@ -0,0 +1,5 @@
+pre-commit
+sphinx
+sphinx-autobuild
+sphinx_rtd_theme
+recommonmark
diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt
new file mode 100644
index 000000000..f1917aad3
--- /dev/null
+++ b/environments/minimal_requires.txt
@@ -0,0 +1,17 @@
+datasets==2.11.0
+loguru
+tqdm
+jsonargparse[signatures]
+matplotlib
+pandas
+requests
+wget
+zstandard
+pdfplumber
+python-docx
+streamlit
+spacy==3.5.0
+en_core_web_md @ https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/models/en_core_web_md-3.5.0-py3-none-any.whl
+zh_core_web_md @ https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/models/zh_core_web_md-3.5.0-py3-none-any.whl
+multiprocess==0.70.12
+dill==0.3.4
diff --git a/environments/preprocess_requires.txt b/environments/preprocess_requires.txt
new file mode 100644
index 000000000..657e1936b
--- /dev/null
+++ b/environments/preprocess_requires.txt
@@ -0,0 +1,2 @@
+fire
+jsonlines
diff --git a/environments/quality_classifier_requires.txt b/environments/quality_classifier_requires.txt
new file mode 100644
index 000000000..e7b76ed45
--- /dev/null
+++ b/environments/quality_classifier_requires.txt
@@ -0,0 +1,3 @@
+pyspark
+fire
+wget
diff --git a/environments/science_requires.txt b/environments/science_requires.txt
new file mode 100644
index 000000000..d0410bd57
--- /dev/null
+++ b/environments/science_requires.txt
@@ -0,0 +1,13 @@
+fasttext
+kenlm @ http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/kenlm-master.zip
+sentencepiece
+scipy
+tabulate
+pandas
+ftfy
+emoji==2.2.0
+regex
+simhash-py
+selectolax
+nltk
+transformers
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 000000000..ebc6dcad3
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+[flake8]
+per-file-ignores =
+ */__init__.py: F401
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..69a2bb3b1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,69 @@
+import logging
+import os.path
+import re
+
+import setuptools
+
+
+def get_package_dir():
+ pkg_dir = {
+ 'data_juicer.tools': 'tools',
+ }
+ return pkg_dir
+
+
+def get_install_requirements(require_f_paths, env_dir='environments'):
+ reqs = []
+ for path in require_f_paths:
+ target_f = os.path.join(env_dir, path)
+ if not os.path.exists(target_f):
+ logging.warning(f'target file does not exist: {target_f}')
+ else:
+ with open(target_f, 'r', encoding='utf-8') as fin:
+ reqs += [x.strip() for x in fin.read().splitlines()]
+ reqs = [x for x in reqs if not x.startswith('#')]
+ return reqs
+
+
+# allowing selective installment based on users' needs
+# TODO: The specific taxonomy and dependencies will be determined
+# after implementing some preliminary operators and detailed discussions
+min_requires = get_install_requirements(
+ ['minimal_requires.txt', 'science_requires.txt'])
+extra_requires = {
+ 'mini':
+ min_requires,
+ 'dev':
+ get_install_requirements(['dev_requires.txt']),
+ 'tools':
+ get_install_requirements(
+ ['preprocess_requires.txt', 'quality_classifier_requires.txt']),
+}
+# flatten the per-extra requirement lists into a single list for the 'all' extra
+extra_requires['all'] = [v for req_list in extra_requires.values() for v in req_list]
+
+with open('data_juicer/__init__.py', 'r') as f:
+ version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(),
+ re.MULTILINE).group(1)
+
+with open('README.md', encoding='utf-8') as f:
+ readme_md = f.read()
+
+setuptools.setup(
+ name='data_juicer',
+ version=version,
+ author='SysML team of Alibaba DAMO Academy',
+ description='A Data-Centric Text Processing System for Large Language '
+ 'Models.',
+ long_description=readme_md,
+ long_description_content_type='text/markdown',
+ license='Apache License 2.0',
+ packages=setuptools.find_packages(),
+ package_dir=get_package_dir(),
+ install_requires=min_requires,
+ extras_require=extra_requires,
+ classifiers=[
+ 'License :: OSI Approved :: Apache Software License',
+ 'Programming Language :: Python :: 3',
+ 'Operating System :: OS Independent'
+ ],
+)
diff --git a/tests/.DS_Store b/tests/.DS_Store
new file mode 100644
index 000000000..35f1b0501
Binary files /dev/null and b/tests/.DS_Store differ
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/analysis/__init__.py b/tests/analysis/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/config/__init__.py b/tests/config/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/config/demo_4_test.yaml b/tests/config/demo_4_test.yaml
new file mode 100644
index 000000000..39d11fd8f
--- /dev/null
+++ b/tests/config/demo_4_test.yaml
@@ -0,0 +1,18 @@
+# Process config example for the demo dataset
+
+# global parameters
+project_name: 'test_demo'
+dataset_path: './demo/demo-dataset.jsonl' # path to your dataset directory or file
+np: 4 # number of subprocess to process your dataset
+
+export_path: './outputs/demo/demo-processed.parquet'
+
+# process schedule
+# a list of several process operators with their arguments
+process:
+ - whitespace_normalization_mapper:
+ - language_id_score_filter:
+ lang: 'zh'
+ - document_deduplicator: # deduplicate text samples using md5 hashing exact matching method
+ lowercase: false # whether to convert text to lower case
+ ignore_non_character: false
diff --git a/tests/config/test_config_funcs.py b/tests/config/test_config_funcs.py
new file mode 100644
index 000000000..74996cc6a
--- /dev/null
+++ b/tests/config/test_config_funcs.py
@@ -0,0 +1,115 @@
+import os
+import unittest
+from contextlib import redirect_stdout
+from io import StringIO
+
+from jsonargparse import Namespace
+
+from data_juicer.config import init_configs
+from data_juicer.ops import load_ops
+
+test_yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+ 'demo_4_test.yaml')
+
+
+class ConfigTest(unittest.TestCase):
+
+ def test_help_info(self):
+ out = StringIO()
+ with redirect_stdout(out), self.assertRaises(SystemExit):
+ _ = init_configs(args=['--help'])
+ out_str = out.getvalue()
+ self.assertIn('usage:', out_str, 'lacks message for command beginning')
+ self.assertIn('--config CONFIG', out_str,
+ 'lacks message for positional argument')
+ self.assertIn('[--project_name PROJECT_NAME]', out_str,
+ 'lacks message for optional argument')
+ self.assertIn(
+ 'Number of processes to process dataset. (type:', out_str,
+ 'the help message of `np` argument does not show as expected')
+
+ def test_yaml_cfg_file(self):
+ out = StringIO()
+ with redirect_stdout(out):
+ cfg = init_configs(args=f'--config {test_yaml_path}'.split())
+ self.assertIsInstance(cfg, Namespace)
+ self.assertEqual(cfg.project_name, 'test_demo')
+ self.assertDictEqual(
+ cfg.process[0],
+ {'whitespace_normalization_mapper': {
+ 'text_key': None
+ }}, 'nested dict load fail, for nonparametric op')
+ self.assertDictEqual(
+ cfg.process[1], {
+ 'language_id_score_filter': {
+ 'lang': 'zh',
+ 'min_score': 0.8,
+ 'text_key': None
+ }
+ }, 'nested dict load fail, un-expected internal value')
+
+ op_from_cfg = load_ops(cfg.process, cfg.text_key_to_process)
+ self.assertTrue(len(op_from_cfg) == 3)
+
+ def test_mixture_cfg(self):
+ out = StringIO()
+ with redirect_stdout(out):
+ ori_cfg = init_configs(args=f'--config {test_yaml_path}'.split())
+ mixed_cfg_1 = init_configs(
+ args=f'--config {test_yaml_path} '
+ '--language_id_score_filter.lang en'.split())
+ mixed_cfg_2 = init_configs(
+ args=f'--config {test_yaml_path} '
+ '--language_id_score_filter.lang=fr'.split())
+ mixed_cfg_3 = init_configs(
+ args=f'--config {test_yaml_path} '
+ '--language_id_score_filter.lang zh '
+ '--language_id_score_filter.min_score 0.6'.split())
+ mixed_cfg_4 = init_configs(
+ args=f'--config {test_yaml_path} '
+ '--language_id_score_filter.lang=en '
+ '--language_id_score_filter.min_score=0.5'.split())
+ self.assertDictEqual(
+ ori_cfg.process[1], {
+ 'language_id_score_filter': {
+ 'lang': 'zh',
+ 'min_score': 0.8,
+ 'text_key': None
+ }
+ })
+ self.assertDictEqual(
+ mixed_cfg_1.process[1], {
+ 'language_id_score_filter': {
+ 'lang': 'en',
+ 'min_score': 0.8,
+ 'text_key': None
+ }
+ })
+ self.assertDictEqual(
+ mixed_cfg_2.process[1], {
+ 'language_id_score_filter': {
+ 'lang': 'fr',
+ 'min_score': 0.8,
+ 'text_key': None
+ }
+ })
+ self.assertDictEqual(
+ mixed_cfg_3.process[1], {
+ 'language_id_score_filter': {
+ 'lang': 'zh',
+ 'min_score': 0.6,
+ 'text_key': None
+ }
+ })
+ self.assertDictEqual(
+ mixed_cfg_4.process[1], {
+ 'language_id_score_filter': {
+ 'lang': 'en',
+ 'min_score': 0.5,
+ 'text_key': None
+ }
+ })
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/core/__init__.py b/tests/core/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/format/__init__.py b/tests/format/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/format/data/structured/demo-dataset.csv b/tests/format/data/structured/demo-dataset.csv
new file mode 100644
index 000000000..3ead57857
--- /dev/null
+++ b/tests/format/data/structured/demo-dataset.csv
@@ -0,0 +1,7 @@
+text,meta
+Today is Sunday and it's a happy day!,"{'src': 'Arxiv', 'date': datetime.datetime(2023, 4, 27, 0, 0), 'version': '1.0', 'author': None}"
+Do you need a cup of coffee?,"{'src': 'code', 'date': None, 'version': None, 'author': 'xxx'}"
+你好,请问你是谁,"{'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}"
+"Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.","{'src': 'Oscar', 'date': None, 'version': '2.0', 'author': None}"
+欢迎来到阿里巴巴!,"{'src': 'customized', 'date': None, 'version': '0.1', 'author': 'xxx'}"
+This paper proposed a novel method on LLM pretraining.,"{'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}"
diff --git a/tests/format/data/structured/demo-dataset.jsonl b/tests/format/data/structured/demo-dataset.jsonl
new file mode 100644
index 000000000..707f802b0
--- /dev/null
+++ b/tests/format/data/structured/demo-dataset.jsonl
@@ -0,0 +1,2 @@
+{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}}
+{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}}
diff --git a/tests/format/data/structured/demo-dataset.parquet b/tests/format/data/structured/demo-dataset.parquet
new file mode 100644
index 000000000..57ea0c38e
Binary files /dev/null and b/tests/format/data/structured/demo-dataset.parquet differ
diff --git a/tests/format/data/structured/demo-dataset.tsv b/tests/format/data/structured/demo-dataset.tsv
new file mode 100644
index 000000000..2cc07067a
--- /dev/null
+++ b/tests/format/data/structured/demo-dataset.tsv
@@ -0,0 +1,7 @@
+text meta
+Today is Sunday and it's a happy day! {'src': 'Arxiv', 'date': datetime.datetime(2023, 4, 27, 0, 0), 'version': '1.0', 'author': None}
+Do you need a cup of coffee? {'src': 'code', 'date': None, 'version': None, 'author': 'xxx'}
+你好,请问你是谁 {'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}
+Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément. {'src': 'Oscar', 'date': None, 'version': '2.0', 'author': None}
+欢迎来到阿里巴巴! {'src': 'customized', 'date': None, 'version': '0.1', 'author': 'xxx'}
+This paper proposed a novel method on LLM pretraining. {'src': 'customized', 'date': None, 'version': None, 'author': 'xxx'}
diff --git a/tests/format/data/text/sample1.txt b/tests/format/data/text/sample1.txt
new file mode 100644
index 000000000..698ad7c54
--- /dev/null
+++ b/tests/format/data/text/sample1.txt
@@ -0,0 +1 @@
+Today is Sunday and it's a happy day!
diff --git a/tests/format/data/text/sample2.txt b/tests/format/data/text/sample2.txt
new file mode 100644
index 000000000..5d6227b09
--- /dev/null
+++ b/tests/format/data/text/sample2.txt
@@ -0,0 +1 @@
+Do you need a cup of coffee?
diff --git a/tests/format/data/text/sample3.txt b/tests/format/data/text/sample3.txt
new file mode 100644
index 000000000..78dc2d5ad
--- /dev/null
+++ b/tests/format/data/text/sample3.txt
@@ -0,0 +1 @@
+你好,请问你是谁
diff --git a/tests/format/data/text/sample4.txt b/tests/format/data/text/sample4.txt
new file mode 100644
index 000000000..704306740
--- /dev/null
+++ b/tests/format/data/text/sample4.txt
@@ -0,0 +1 @@
+Sur la plateforme MT4, plusieurs manières d'accéder à ces fonctionnalités sont conçues simultanément.
diff --git a/tests/format/data/text/sample5.txt b/tests/format/data/text/sample5.txt
new file mode 100644
index 000000000..0390b9676
--- /dev/null
+++ b/tests/format/data/text/sample5.txt
@@ -0,0 +1 @@
+欢迎来到阿里巴巴!
diff --git a/tests/format/data/text/sample6.txt b/tests/format/data/text/sample6.txt
new file mode 100644
index 000000000..ea375cee5
--- /dev/null
+++ b/tests/format/data/text/sample6.txt
@@ -0,0 +1 @@
+This paper proposed a novel method on LLM pretraining.
diff --git a/tests/format/test_csv_formatter.py b/tests/format/test_csv_formatter.py
new file mode 100644
index 000000000..7bb99d978
--- /dev/null
+++ b/tests/format/test_csv_formatter.py
@@ -0,0 +1,28 @@
+import os
+import unittest
+
+from data_juicer.format.csv_formatter import CsvFormatter
+
+
+class CsvFormatterTest(unittest.TestCase):
+
+ def setUp(self):
+ self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'structured')
+ self._file = os.path.join(self._path, 'demo-dataset.csv')
+ print(self._file)
+
+ def test_csv_file(self):
+ formatter = CsvFormatter(self._file)
+ ds = formatter.load_dataset()
+ self.assertEqual(len(ds), 6)
+ self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
+
+ def test_csv_path(self):
+ formatter = CsvFormatter(self._path)
+ ds = formatter.load_dataset()
+ self.assertEqual(len(ds), 6)
+ self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/format/test_parquet_formatter.py b/tests/format/test_parquet_formatter.py
new file mode 100644
index 000000000..ddd7b80d7
--- /dev/null
+++ b/tests/format/test_parquet_formatter.py
@@ -0,0 +1,28 @@
+import os
+import unittest
+
+from data_juicer.format.parquet_formatter import ParquetFormatter
+
+
+class ParquetFormatterTest(unittest.TestCase):
+
+ def setUp(self):
+ self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'structured')
+ self._file = os.path.join(self._path, 'demo-dataset.parquet')
+ print(self._file)
+
+ def test_parquet_file(self):
+ formatter = ParquetFormatter(self._file)
+ ds = formatter.load_dataset()
+ self.assertEqual(len(ds), 6)
+ self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
+
+ def test_parquet_path(self):
+ formatter = ParquetFormatter(self._path)
+ ds = formatter.load_dataset()
+ self.assertEqual(len(ds), 6)
+ self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/format/test_tsv_formatter.py b/tests/format/test_tsv_formatter.py
new file mode 100644
index 000000000..5f6da8a78
--- /dev/null
+++ b/tests/format/test_tsv_formatter.py
@@ -0,0 +1,28 @@
+import os
+import unittest
+
+from data_juicer.format.tsv_formatter import TsvFormatter
+
+
+class TsvFormatterTest(unittest.TestCase):
+
+ def setUp(self):
+ self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'structured')
+ self._file = os.path.join(self._path, 'demo-dataset.tsv')
+ print(self._file)
+
+ def test_tsv_file(self):
+ formatter = TsvFormatter(self._file)
+ ds = formatter.load_dataset()
+ self.assertEqual(len(ds), 6)
+ self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
+
+ def test_tsv_path(self):
+ formatter = TsvFormatter(self._path)
+ ds = formatter.load_dataset()
+ self.assertEqual(len(ds), 6)
+ self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/format/test_unify_format.py b/tests/format/test_unify_format.py
new file mode 100644
index 000000000..03a55e391
--- /dev/null
+++ b/tests/format/test_unify_format.py
@@ -0,0 +1,447 @@
+import os
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.format.formatter import load_dataset, unify_format
+
+
+class UnifyFormatTest(unittest.TestCase):
+
+ def run_test(self, sample, args=None):
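+ # build a Dataset from the 'source' samples, unify its format with the
+ # given args, and check that the result matches the 'target' samples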
+ if args is None:
+ args = {}
+ ds = Dataset.from_list(sample['source'])
+ ds = unify_format(ds, **args)
+ self.assertEqual(ds.to_list(), sample['target'])
+
+ def test_text_key(self):
+ samples = [
+ {
+ 'source': [{
+ 'text': 'This is a test text',
+ 'outer_key': 1,
+ }],
+ 'target': [{
+ 'text': 'This is a test text',
+ 'meta.outer_key': 1,
+ }]
+ },
+ {
+ 'source': [{
+ 'content': 'This is a test text',
+ 'outer_key': 1,
+ }],
+ 'target': [{
+ 'text': 'This is a test text',
+ 'meta.outer_key': 1,
+ }]
+ },
+ {
+ 'source': [{
+ 'input': 'This is a test text, input part',
+ 'instruction': 'This is a test text, instruction part',
+ 'outer_key': 1,
+ }],
+ 'target': [{
+ 'text.input': 'This is a test text, input part',
+ 'text.instruction':
+ 'This is a test text, instruction part',
+ 'meta.outer_key': 1,
+ }]
+ },
+ ]
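+ # the first sample uses the default 'text' key; the others load
+ # customized keys via the text_keys_to_load argument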
+ self.run_test(samples[0])
+ self.run_test(samples[1], args={'text_keys_to_load': ['content']})
+ self.run_test(samples[2],
+ args={'text_keys_to_load': ['input', 'instruction']})
+
+ def test_empty_text(self):
+ # samples whose text field is None are filtered out, but '' is kept
+ samples = [
+ {
+ 'source': [{
+ 'text': '',
+ 'outer_key': 1,
+ }],
+ 'target': [{
+ 'text': '',
+ 'meta.outer_key': 1,
+ }],
+ },
+ {
+ 'source': [{
+ 'text': None,
+ 'outer_key': 1,
+ }],
+ 'target': [],
+ },
+ ]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_no_extra_fields(self):
+ samples = [{
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ }, {
+ 'source': [{
+ 'text': 'This is a test text.',
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ }],
+ }]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_no_extra_fields_except_meta(self):
+ samples = [{
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'version': 1
+ },
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'version': 1
+ },
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ }, {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'version': 1
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'version': 1
+ },
+ }],
+ }]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_invalid_stats(self):
+ # non-dict stats will be unified into meta.stats
+ samples = [{
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'stats': 'nice',
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta.stats': 'nice'
+ }],
+ }, {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'stats': {
+ 'version': 1
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'stats': {
+ 'version': 1
+ },
+ }],
+ }]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_outer_fields(self):
+ samples = [
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'meta_inner': 'nice'
+ },
+ 'outer_field': 'value'
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'meta_inner': 'nice',
+ },
+ 'meta.outer_field': 'value',
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'outer_key': 'nice',
+ 'outer_field': 'value'
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta.outer_key': 'nice',
+ 'meta.outer_field': 'value',
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': 'nice',
+ 'outer_field': 'value'
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': 'nice',
+ 'meta.outer_field': 'value',
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'meta_inner': 'nice'
+ },
+ 'outer_field': 'value',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'meta_inner': 'nice'
+ },
+ 'meta.outer_field': 'value',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'outer_key': 'nice',
+ 'outer_field': 'value',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta.outer_key': 'nice',
+ 'meta.outer_field': 'value',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': 'nice',
+ 'outer_field': 'value',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': 'nice',
+ 'meta.outer_field': 'value',
+ 'stats': {
+ 'lang': 'en'
+ },
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'meta_inner': 'nice'
+ },
+ 'outer_field': 'value',
+ 'stats': 'en',
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {
+ 'meta_inner': 'nice'
+ },
+ 'meta.outer_field': 'value',
+ 'meta.stats': 'en'
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'outer_key': 'nice',
+ 'outer_field': 'value',
+ 'stats': 'en',
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta.outer_key': 'nice',
+ 'meta.outer_field': 'value',
+ 'meta.stats': 'en'
+ }],
+ },
+ {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': 'nice',
+ 'outer_field': 'value',
+ 'stats': 'en',
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': 'nice',
+ 'meta.outer_field': 'value',
+ 'meta.stats': 'en'
+ }],
+ },
+ ]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_recursive_meta(self):
+ samples = [{
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'outer_field': {
+ 'rec1': {
+ 'rec2': 'value'
+ }
+ },
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta.outer_field': {
+ 'rec1': {
+ 'rec2': 'value'
+ }
+ },
+ }],
+ }]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_hetero_meta(self):
+ cur_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'structured')
+ file_path = os.path.join(cur_dir, 'demo-dataset.jsonl')
+ ds = load_dataset('json', data_files=file_path)
+ ds = unify_format(ds)
+ import datetime
+ # None values mark fields that were missing from a sample before merging
+ sample = [{
+ 'text': "Today is Sunday and it's a happy day!",
+ 'meta': {
+ 'src': 'Arxiv',
+ 'date': datetime.datetime(2023, 4, 27, 0, 0),
+ 'version': '1.0',
+ 'author': None
+ }
+ }, {
+ 'text': 'Do you need a cup of coffee?',
+ 'meta': {
+ 'src': 'code',
+ 'date': None,
+ 'version': None,
+ 'author': 'xxx'
+ }
+ }]
+ unified_sample_list = ds.to_list()
+ self.assertEqual(unified_sample_list, sample)
+ # test nested and missing fields for the following access orders:
+ # 1. first row, then column
+ unified_sample_first = ds[0]
+ unified_sample_second = ds[1]
+ self.assertEqual(unified_sample_first['meta.src'], 'Arxiv')
+ self.assertEqual(unified_sample_first['meta.author'], None)
+ self.assertEqual(unified_sample_second['meta.date'], None)
+ # 2. first column, then row
+ self.assertEqual(ds['meta.src'][0], 'Arxiv')
+ self.assertEqual(ds['meta.src'][1], 'code')
+ self.assertEqual(ds['meta.author'][0], None)
+ self.assertEqual(ds['meta.date'][1], None)
+ # 3. first partial rows, then column, final row
+ unified_ds_first = ds.select([0])
+ unified_ds_second = ds.select([1])
+ self.assertEqual(unified_ds_first['meta.src'][0], 'Arxiv')
+ self.assertEqual(unified_ds_first['meta.author'][0], None)
+ self.assertEqual(unified_ds_second['meta.date'][0], None)
+
+ def test_empty_meta(self):
+ samples = [{
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {},
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {},
+ }],
+ }]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_empty_stats(self):
+ samples = [{
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {},
+ 'stats': {},
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {},
+ 'stats': {},
+ }],
+ }]
+ for sample in samples:
+ self.run_test(sample)
+
+ def test_empty_outer_fields(self):
+ samples = [{
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'meta': {},
+ 'out_field': {},
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta': {},
+ 'meta.out_field': {},
+ }],
+ }, {
+ 'source': [{
+ 'text': 'This is a test text.',
+ 'out_field': {},
+ }],
+ 'target': [{
+ 'text': 'This is a test text.',
+ 'meta.out_field': {},
+ }],
+ }]
+ for sample in samples:
+ self.run_test(sample)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/.DS_Store b/tests/ops/.DS_Store
new file mode 100644
index 000000000..2220be4b6
Binary files /dev/null and b/tests/ops/.DS_Store differ
diff --git a/tests/ops/__init__.py b/tests/ops/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/ops/common/__init__.py b/tests/ops/common/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/ops/deduplicator/__init__.py b/tests/ops/deduplicator/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/ops/deduplicator/test_document_deduplicator.py b/tests/ops/deduplicator/test_document_deduplicator.py
new file mode 100644
index 000000000..740caae18
--- /dev/null
+++ b/tests/ops/deduplicator/test_document_deduplicator.py
@@ -0,0 +1,100 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.deduplicator.document_deduplicator import \
+ DocumentDeduplicator
+
+
+class DocumentDeduplicatorTest(unittest.TestCase):
+
+ def _run_doc_dedup(self, dataset: Dataset, target_list, op):
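+ # compute the hash for each sample, run the deduplicator op, and compare
+ # the kept texts with the expected target list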
+ dataset = dataset.map(op.compute_hash)
+ dataset, _ = op.process(dataset)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_english_deduplication(self):
+ ds_list = [
+ {
+ 'text': 'Today is Sunday and it\'s a happy day!'
+ },
+ {
+ 'text': 'Do you need a cup of coffee?'
+ },
+ {
+ 'text': 'Today is sunday and it\'s a happy day!'
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ ]
+ tgt_list = [{
+ 'text': 'Today is Sunday and it\'s a happy day!'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'Today is sunday and it\'s a happy day!'
+ }, {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = DocumentDeduplicator(lowercase=False, ignore_non_character=False)
+ self._run_doc_dedup(dataset, tgt_list, op)
+
+ def test_chinese_deduplication(self):
+ ds_list = [
+ {
+ 'text': '你好,请问你是谁'
+ },
+ {
+ 'text': '欢迎来到阿里巴巴!'
+ },
+ {
+ 'text':
+ '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1.'
+ },
+ {
+ 'text':
+ '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1.'
+ },
+ {
+ 'text':
+ '第九届会议\n时间:2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1.'
+ },
+ ]
+ tgt_list = [
+ {
+ 'text': '你好,请问你是谁'
+ },
+ {
+ 'text': '欢迎来到阿里巴巴!'
+ },
+ {
+ 'text':
+ '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1.'
+ },
+ {
+ 'text':
+ '第九届会议\n时间:2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1.'
+ },
+ ]
+ dataset = Dataset.from_list(ds_list)
+ op = DocumentDeduplicator(lowercase=False, ignore_non_character=False)
+ self._run_doc_dedup(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/deduplicator/test_document_minhash_deduplicator.py b/tests/ops/deduplicator/test_document_minhash_deduplicator.py
new file mode 100644
index 000000000..b60209e8b
--- /dev/null
+++ b/tests/ops/deduplicator/test_document_minhash_deduplicator.py
@@ -0,0 +1,962 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.deduplicator.document_minhash_deduplicator import \
+ DocumentMinhashDeduplicator
+
+
+class DocumentMinhashDeduplicatorTest(unittest.TestCase):
+
+ def _run_minhash_dedup(self, dataset: Dataset, target_list, op):
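+ # compute MinHash signatures for each sample, run the deduplicator op,
+ # and compare the kept texts with the expected target list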
+ dataset = dataset.map(op.compute_hash)
+ dataset, _ = op.process(dataset)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_english_deduplication(self):
+ ds_list = [
+ {
+ 'text': 'Today is Sunday and it\'s a happy day!'
+ },
+ {
+ 'text': 'Do you need a cup of coffee?'
+ },
+ {
+ 'text': 'Today is sunday and it\'s really a happy day!'
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux Falls, '
+ 'South Dakota. The plant slaughters 19,500 pigs a day — 5 '
+ 'percent of U.S. pork. Most of the workers are immigrants '
+ 'from Ethiopia, Mexico, South Sudan, Honduras, Myanmar, '
+ 'Somalia, Guatemala, and other poor countries.\n\nInevitably '
+ 'workers must pass within one foot of hundreds of colleagues '
+ 'in the hallways, locker rooms, cafeterias, and cutting '
+ 'lines. The same conditions have spurred Covid-19 outbreaks '
+ 'at meat plants from Minnesota and Wisconsin to Colorado, '
+ 'Nebraska, Missouri, Iowa, Pennsylvania, North Carolina, and '
+ 'Georgia.\n\n801 workers at the Sioux Falls plant have tested '
+ 'positive, together with 206 people close to them. The '
+ 'outbreak has killed Agustín Rodríguez Martínez, aged 64, an '
+ 'employee with two decades of experience originally from El '
+ 'Salvador, and Craig Allen Franken, 61, who worked for '
+ 'Smithfield his entire adult life.\n\nThe company knew of its '
+ 'first infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting '
+ 'organized.\n\nThe major union in the industry, including at '
+ 'Smithfield, is the United Food and Commercial Workers union '
+ '(UFCW). What union leaders have done is ultimately '
+ 'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
+ 'Kooper Caraway has publicly said management delayed safety '
+ 'action as long as possible for profit. But while some '
+ 'workers were demanding a two-week shutdown, Caraway told '
+ 'the Argus Leader that was unrealistic because the '
+ 'government considers the plant essential. He suggested the '
+ 'union would be happy with minimal safety measures: “Even if '
+ '10 people get exposed in a day rather than 11. If you can '
+ 'implement a program where even one or two less people get '
+ 'exposed during a shift, that’s one or two less people.” Of '
+ 'course reducing infections is good, but suggesting workers '
+ 'would be satisfied if the company allowed 90% of the '
+ 'contagion to continue is horrifying.\n\nThe response of '
+ 'UFCW leadership was worse. As the disease was exploding, '
+ 'they told the Argus Leader, “We applaud [Smithfield’s] '
+ 'decision to temporarily close the plant [over Easter '
+ 'weekend] to push for an even safer work environment.” What '
+ 'does “even safer” mean in this context?\n\nThe union '
+ 'bureaucracy has taken weak action elsewhere. In '
+ 'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
+ 'months with Cargill Meat — the same pandemic premium Amazon '
+ 'gave workers without a union. In Nebraska, the UFCW '
+ 'negotiated $4 hazard pay for one month with meat giant '
+ 'JBS.\n\nThe union has said nothing about forcing companies '
+ 'to send older workers home with pay, even though a '
+ '70-year-old shop steward and a 78-year-old grandfather '
+ 'working at JBS plants were killed by Covid-19. Smithfield '
+ 'workers were promised only two weeks of shutdown pay. For '
+ 'many, this compensation is half their normal paycheck '
+ 'because they routinely put in 66 hour weeks — overtime that '
+ 'costs exhaustion and chronic pain.\n\nUnion officials '
+ 'endeavor to cooperate with the meat companies. An Iowa UFCW '
+ 'president actually suggested it might be impossible for '
+ 'plants to move workers a full six feet apart and told the '
+ 'Des Moines Register, “We can’t stop the plants. If we stop '
+ 'the plants from running, we stop feeding the country. We '
+ 'want to do everything we can to make sure the employees are '
+ 'safe to keep the plant running.”\n\nEvery part of this '
+ 'explanation directly overlaps with what the Smithfield CEO '
+ 'said. Unfortunately, it amounts to accepting the company’s '
+ 'excuses.\n\nThey claim that workers who do hard physical '
+ 'labor, waking up at 4 a.m. and often working six days a '
+ 'week for years, would be guilty of taking food away from '
+ 'the people and hurting America if they dared to fight for '
+ 'their human needs. But nothing is said about the company '
+ 'raking in profits and even murdering workers to increase '
+ 'them.\n\nSmithfield’s parent company W.H. Group, '
+ 'which slaughters around 30 million pigs per year in plants '
+ 'in both the United States and China, saw its profits '
+ 'skyrocket by about one third in 2019 to $1.38 billion. It '
+ 'is disturbing that UFCW officials do not bring up these '
+ 'soaring profits in their response to the outbreaks. Reuters '
+ 'published a report on the corporation’s financial success '
+ 'in late March. The head of W.H. Group had touted to the '
+ 'media that it got through the pandemic in China with very '
+ 'limited impact on production.\n\nIt is true that many '
+ 'Smithfield workers are reasonably afraid for their jobs and '
+ 'want to keep working. A 25-year-old employee explained, '
+ '“I have a lot of bills. My baby’s coming soon — I have to '
+ 'work.” At the same time, he was afraid of infecting his '
+ 'pregnant wife. His spouse, a former employee, '
+ 'said bitterly, “Smithfield— they don’t care about '
+ 'employees. They only care about their money.”\n\nWorkers '
+ 'are pressured in these two painful directions. Nonetheless, '
+ 'work can mean solidarity. Before Smithfield even checked '
+ 'temperatures, there was a “sick-out” strike without union '
+ 'support by 800 to 1,000 workers at a JBS meat factory in '
+ 'Colorado. Hundreds of workers also called in sick days at a '
+ 'Nebraska JBS plant.\n\nTrade union leaders won’t even '
+ 'whisper the word “strike” when thousands of workers are '
+ 'thinking about it. They are limiting themselves to polite '
+ 'requests. We need a workers’ movement that asks who '
+ 'controls the factory, that threatens to disrupt the bosses’ '
+ 'profits, and that allows workers to use their immense power '
+ '— this could change the meat industry and the world. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plants in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pig a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting '
+ 'organized.\n\nThe major union in the industry, including at '
+ 'Smithfield, is the United Food and Commercial Workers union '
+ '(UFCW). What union leaders have done is ultimately '
+ 'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
+ 'Kooper Caraway has publicly said management delayed safety '
+ 'action as long as possible for profit. But while some '
+ 'workers were demanding a two-week shutdown, Caraway told '
+ 'the Argus Leader that was unrealistic because the '
+ 'government considers the plant essential. He suggested the '
+ 'union would be happy with minimal safety measures: “Even if '
+ '10 people get exposed in a day rather than 11. If you can '
+ 'implement a program where even one or two less people get '
+ 'exposed during a shift, that’s one or two less people.” Of '
+ 'course reducing infections is good, but suggesting workers '
+ 'would be satisfied if the company allowed 90% of the '
+ 'contagion to continue is horrifying.\n\nThe response of '
+ 'UFCW leadership was worse. As the disease was exploding, '
+ 'they told the Argus Leader, “We applaud [Smithfield’s] '
+ 'decision to temporarily close the plant [over Easter '
+ 'weekend] to push for an even safer work environment.” What '
+ 'does “even safer” mean in this context?\n\nThe union '
+ 'bureaucracy has taken weak action elsewhere. In '
+ 'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
+ 'months with Cargill Meat — the same pandemic premium Amazon '
+ 'gave workers without a union. In Nebraska, the UFCW '
+ 'negotiated $4 hazard pay for one month with meat giant '
+ 'JBS.\n\nThe union has said nothing about forcing companies '
+ 'to send older workers home with pay, even though a '
+ '70-year-old shop steward and a 78-year-old grandfather '
+ 'working at JBS plants were killed by Covid-19. Smithfield '
+ 'workers were promised only two weeks of shutdown pay. For '
+ 'many, this compensation is half their normal paycheck '
+ 'because they routinely put in 66 hour weeks — overtime that '
+ 'costs exhaustion and chronic pain.\n\nUnion officials '
+ 'endeavor to cooperate with the meat companies. An Iowa UFCW '
+ 'president actually suggested it might be impossible for '
+ 'plants to move workers a full six feet apart and told the '
+ 'Des Moines Register, “We can’t stop the plants. If we stop '
+ 'the plants from running, we stop feeding the country. We '
+ 'want to do everything we can to make sure the employees are '
+ 'safe to keep the plant running.”\n\nEvery part of this '
+ 'explanation directly overlaps with what the Smithfield CEO '
+ 'said. Unfortunately, it amounts to accepting the company’s '
+ 'excuses.\n\nThey claim that workers who do hard physical '
+ 'labor, waking up at 4 a.m. and often working six days a '
+ 'week for years, would be guilty of taking food away from '
+ 'the people and hurting America if they dared to fight for '
+ 'their human needs. But nothing is said about the company '
+ 'raking in profits and even murdering workers to increase '
+ 'them.\n\nSmithfield’s parent company W.H. Group, '
+ 'which slaughters around 30 million pigs per year in plants '
+ 'in both the United States and China, saw its profits '
+ 'skyrocket by about one third in 2019 to $1.38 billion. It '
+ 'is disturbing that UFCW officials do not bring up these '
+ 'soaring profits in their response to the outbreaks. Reuters '
+ 'published a report on the corporation’s financial success '
+ 'in late March. The head of W.H. Group had touted to the '
+ 'media that it got through the pandemic in China with very '
+ 'limited impact on production.\n\nIt is true that many '
+ 'Smithfield workers are reasonably afraid for their jobs and '
+ 'want to keep working. A 25-year-old employee explained, '
+ '“I have a lot of bills. My baby’s coming soon — I have to '
+ 'work.” At the same time, he was afraid of infecting his '
+ 'pregnant wife. His spouse, a former employee, '
+ 'said bitterly, “Smithfield— they don’t care about '
+ 'employees. They only care about their money.”\n\nWorkers '
+ 'are pressured in these two painful directions. Nonetheless, '
+ 'work can mean solidarity. Before Smithfield even checked '
+ 'temperatures, there was a “sick-out” strike without union '
+ 'support by 800 to 1,000 workers at a JBS meat factory in '
+ 'Colorado. Hundreds of workers also called in sick days at a '
+ 'Nebraska JBS plant.\n\nTrade union leaders won’t even '
+ 'whisper the word “strike” when thousands of workers are '
+ 'thinking about it. They are limiting themselves to polite '
+ 'requests. We need a workers’ movement that asks who '
+ 'controls the factory, that threatens to disrupt the bosses’ '
+ 'profits, and that allows workers to use their immense power '
+ '— this could change the meat industry and the world. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting organized. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plants in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pig a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting organized. '
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ ]
+ tgt_list = [
+ {
+ 'text': 'Today is Sunday and it\'s a happy day!'
+ },
+ {
+ 'text': 'Do you need a cup of coffee?'
+ },
+ {
+ 'text': 'Today is sunday and it\'s really a happy day!'
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting '
+ 'organized.\n\nThe major union in the industry, including at '
+ 'Smithfield, is the United Food and Commercial Workers union '
+ '(UFCW). What union leaders have done is ultimately '
+ 'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
+ 'Kooper Caraway has publicly said management delayed safety '
+ 'action as long as possible for profit. But while some '
+ 'workers were demanding a two-week shutdown, Caraway told '
+ 'the Argus Leader that was unrealistic because the '
+ 'government considers the plant essential. He suggested the '
+ 'union would be happy with minimal safety measures: “Even if '
+ '10 people get exposed in a day rather than 11. If you can '
+ 'implement a program where even one or two less people get '
+ 'exposed during a shift, that’s one or two less people.” Of '
+ 'course reducing infections is good, but suggesting workers '
+ 'would be satisfied if the company allowed 90% of the '
+ 'contagion to continue is horrifying.\n\nThe response of '
+ 'UFCW leadership was worse. As the disease was exploding, '
+ 'they told the Argus Leader, “We applaud [Smithfield’s] '
+ 'decision to temporarily close the plant [over Easter '
+ 'weekend] to push for an even safer work environment.” What '
+ 'does “even safer” mean in this context?\n\nThe union '
+ 'bureaucracy has taken weak action elsewhere. In '
+ 'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
+ 'months with Cargill Meat — the same pandemic premium Amazon '
+ 'gave workers without a union. In Nebraska, the UFCW '
+ 'negotiated $4 hazard pay for one month with meat giant '
+ 'JBS.\n\nThe union has said nothing about forcing companies '
+ 'to send older workers home with pay, even though a '
+ '70-year-old shop steward and a 78-year-old grandfather '
+ 'working at JBS plants were killed by Covid-19. Smithfield '
+ 'workers were promised only two weeks of shutdown pay. For '
+ 'many, this compensation is half their normal paycheck '
+ 'because they routinely put in 66 hour weeks — overtime that '
+ 'costs exhaustion and chronic pain.\n\nUnion officials '
+ 'endeavor to cooperate with the meat companies. An Iowa UFCW '
+ 'president actually suggested it might be impossible for '
+ 'plants to move workers a full six feet apart and told the '
+ 'Des Moines Register, “We can’t stop the plants. If we stop '
+ 'the plants from running, we stop feeding the country. We '
+ 'want to do everything we can to make sure the employees are '
+ 'safe to keep the plant running.”\n\nEvery part of this '
+ 'explanation directly overlaps with what the Smithfield CEO '
+ 'said. Unfortunately, it amounts to accepting the company’s '
+ 'excuses.\n\nThey claim that workers who do hard physical '
+ 'labor, waking up at 4 a.m. and often working six days a '
+ 'week for years, would be guilty of taking food away from '
+ 'the people and hurting America if they dared to fight for '
+ 'their human needs. But nothing is said about the company '
+ 'raking in profits and even murdering workers to increase '
+ 'them.\n\nSmithfield’s parent company W.H. Group, '
+ 'which slaughters around 30 million pigs per year in plants '
+ 'in both the United States and China, saw its profits '
+ 'skyrocket by about one third in 2019 to $1.38 billion. It '
+ 'is disturbing that UFCW officials do not bring up these '
+ 'soaring profits in their response to the outbreaks. Reuters '
+ 'published a report on the corporation’s financial success '
+ 'in late March. The head of W.H. Group had touted to the '
+ 'media that it got through the pandemic in China with very '
+ 'limited impact on production.\n\nIt is true that many '
+ 'Smithfield workers are reasonably afraid for their jobs and '
+ 'want to keep working. A 25-year-old employee explained, '
+ '“I have a lot of bills. My baby’s coming soon — I have to '
+ 'work.” At the same time, he was afraid of infecting his '
+ 'pregnant wife. His spouse, a former employee, '
+ 'said bitterly, “Smithfield— they don’t care about '
+ 'employees. They only care about their money.”\n\nWorkers '
+ 'are pressured in these two painful directions. Nonetheless, '
+ 'work can mean solidarity. Before Smithfield even checked '
+ 'temperatures, there was a “sick-out” strike without union '
+ 'support by 800 to 1,000 workers at a JBS meat factory in '
+ 'Colorado. Hundreds of workers also called in sick days at a '
+ 'Nebraska JBS plant.\n\nTrade union leaders won’t even '
+ 'whisper the word “strike” when thousands of workers are '
+ 'thinking about it. They are limiting themselves to polite '
+ 'requests. We need a workers’ movement that asks who '
+ 'controls the factory, that threatens to disrupt the bosses’ '
+ 'profits, and that allows workers to use their immense power '
+ '— this could change the meat industry and the world. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting organized. '
+ },
+ ]
+ dataset = Dataset.from_list(ds_list)
+ op = DocumentMinhashDeduplicator(ignore_pattern=r'\p{P}')
+ self._run_minhash_dedup(dataset, tgt_list, op)
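+        # Hedged note: `ignore_pattern=r'\p{P}'` presumably strips Unicode
+        # punctuation before shingling, so the long near-duplicate articles
+        # above still land in the same MinHash buckets despite small wording
+        # and punctuation differences. Illustrative only (not executed by the
+        # test); `\p{P}` needs the third-party `regex` package rather than
+        # the standard `re` module:
+        #   import regex
+        #   regex.sub(r'\p{P}', '', "Today is Sunday and it's a happy day!")
+        #   # -> 'Today is Sunday and its a happy day'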
+
+ def test_chinese_deduplication(self):
+ ds_list = [
+ {
+ 'text': '你好,请问你是谁'
+ },
+ {
+ 'text': '欢迎来到阿里巴巴!'
+ },
+ {
+ 'text':
+ '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1. 国际'
+ '海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,'
+ '以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员'
+ '会成员参加委员会会议的费用。\n2. 由于秘书长向会员国发出为该信托基金捐款'
+ '的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账'
+ '户。\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会'
+ '议的方式,包括审查是否可能从管理局行政预算中提供经费。\n4. 自愿信托基金'
+ '迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身'
+ '份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履'
+ '行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而'
+ '未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经'
+ '济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,'
+ '根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至'
+ '215 000美元。\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现'
+ '行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关'
+ '的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基'
+ '金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会'
+ '成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法'
+ '处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率'
+ '提供。 购买机票的所有安排均由联合国秘书处执行。\n6. 虽然已经设立了临时性'
+ '的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对'
+ '信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\n(a) '
+ '从管理局一般行政经费累计利息中拨出一定数额的经费;\n(b) 每年从上一年预算'
+ '未动用部分中拨出规定的数额;\n(c) 从先驱投资者基金利息中拨出规定的数额。'
+ '\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财'
+ '务委员会提出一份报告。\n附件\n资助来自发展中国家的法律和技术委员会以及财'
+ '务\n委员会成员出席会议的指示性费用(美元)\n成员\n机票\n机场\n费用\n金'
+ '斯敦每日生活\n津贴\n转机途中每日生活\n7日\n共计\n14日\n经济舱\n公务舱'
+ '\n7天=(8天每日生活\n津贴)\n14天= (15天每日生活津贴)\n商务舱\n法律和技'
+ '术委员会\n印度尼西亚\n(纽约)\n黎巴嫩\n巴基斯坦\n阿根廷\n喀麦隆\n墨西哥'
+ '\n巴西\n塞内加尔\n莫桑比克\n埃及(纽约)\n大韩民国\n印度\n斐济\n智利\n'
+ '中国\n纳米比亚\n小计\n财务委员会\n缅甸\n乌干达\n牙买加\n印度(纽约)\n尼'
+ '日利亚\n总计\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000'
+ '美元至215 000美元(四舍五入)。'
+ },
+ {
+ 'text':
+ '第九届会议\n时间:2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1. 国际'
+ '海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,'
+ '以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员'
+ '会成员参加委员会会议的费用。\n2. 由于秘书长向会员国发出为该信托基金捐款'
+ '的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账'
+ '户。\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会'
+ '议的方式,包括审查是否可能从管理局行政预算中提供经费。\n4. 自愿信托基金'
+ '迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身'
+ '份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履'
+ '行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而'
+ '未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经'
+ '济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,'
+ '根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至'
+ '215 000美元。\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现'
+ '行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关'
+ '的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基'
+ '金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会'
+ '成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法'
+ '处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率'
+ '提供。 购买机票的所有安排均由联合国秘书处执行。\n6. 虽然已经设立了临时性'
+ '的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对'
+ '信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\n(a) '
+ '从管理局一般行政经费累计利息中拨出一定数额的经费;\n(b) 每年从上一年预算'
+ '未动用部分中拨出规定的数额;\n(c) 从先驱投资者基金利息中拨出规定的数额。'
+ '\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财'
+ '务委员会提出一份报告。\n附件\n资助来自发展中国家的法律和技术委员会以及财'
+ '务\n委员会成员出席会议的指示性费用(美元)\n成员\n机票\n机场\n费用\n金'
+ '斯敦每日生活\n津贴\n转机途中每日生活\n7日\n共计\n14日\n经济舱\n公务舱'
+ '\n7天=(8天每日生活\n津贴)\n14天= (15天每日生活津贴)\n商务舱\n法律和技'
+ '术委员会\n印度尼西亚\n(纽约)\n黎巴嫩\n巴基斯坦\n阿根廷\n喀麦隆\n墨西哥'
+ '\n巴西\n塞内加尔\n莫桑比克\n埃及(纽约)\n大韩民国\n印度\n斐济\n智利\n'
+ '中国\n纳米比亚\n小计\n财务委员会\n缅甸\n乌干达\n牙买加\n印度(纽约)\n尼'
+ '日利亚\n总计\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000'
+ '美元至215 000美元(四舍五入)。'
+ },
+ ]
+ tgt_list = [
+ {
+ 'text': '你好,请问你是谁'
+ },
+ {
+ 'text': '欢迎来到阿里巴巴!'
+ },
+ {
+ 'text':
+ '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1. 国际'
+ '海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,'
+ '以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员'
+ '会成员参加委员会会议的费用。\n2. 由于秘书长向会员国发出为该信托基金捐款'
+ '的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账'
+ '户。\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会'
+ '议的方式,包括审查是否可能从管理局行政预算中提供经费。\n4. 自愿信托基金'
+ '迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身'
+ '份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履'
+ '行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而'
+ '未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经'
+ '济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,'
+ '根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至'
+ '215 000美元。\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现'
+ '行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关'
+ '的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基'
+ '金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会'
+ '成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法'
+ '处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率'
+ '提供。 购买机票的所有安排均由联合国秘书处执行。\n6. 虽然已经设立了临时性'
+ '的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对'
+ '信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\n(a) '
+ '从管理局一般行政经费累计利息中拨出一定数额的经费;\n(b) 每年从上一年预算'
+ '未动用部分中拨出规定的数额;\n(c) 从先驱投资者基金利息中拨出规定的数额。'
+ '\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财'
+ '务委员会提出一份报告。\n附件\n资助来自发展中国家的法律和技术委员会以及财'
+ '务\n委员会成员出席会议的指示性费用(美元)\n成员\n机票\n机场\n费用\n金'
+ '斯敦每日生活\n津贴\n转机途中每日生活\n7日\n共计\n14日\n经济舱\n公务舱'
+ '\n7天=(8天每日生活\n津贴)\n14天= (15天每日生活津贴)\n商务舱\n法律和技'
+ '术委员会\n印度尼西亚\n(纽约)\n黎巴嫩\n巴基斯坦\n阿根廷\n喀麦隆\n墨西哥'
+ '\n巴西\n塞内加尔\n莫桑比克\n埃及(纽约)\n大韩民国\n印度\n斐济\n智利\n'
+ '中国\n纳米比亚\n小计\n财务委员会\n缅甸\n乌干达\n牙买加\n印度(纽约)\n尼'
+ '日利亚\n总计\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000'
+ '美元至215 000美元(四舍五入)。'
+ },
+ ]
+ dataset = Dataset.from_list(ds_list)
+ op = DocumentMinhashDeduplicator(tokenization='character',
+ ignore_pattern=r'\p{P}')
+ self._run_minhash_dedup(dataset, tgt_list, op)
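+        # Hedged note: `tokenization='character'` presumably makes the
+        # deduplicator shingle on character n-grams instead of
+        # whitespace-delimited words, which is what lets MinHash work on
+        # unsegmented Chinese text. The two long UN-style reports above
+        # differ only by the extra "时间:" in the title line, so one of them
+        # is expected to be dropped.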
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/deduplicator/test_document_simhash_deduplicator.py b/tests/ops/deduplicator/test_document_simhash_deduplicator.py
new file mode 100644
index 000000000..d021423c0
--- /dev/null
+++ b/tests/ops/deduplicator/test_document_simhash_deduplicator.py
@@ -0,0 +1,962 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.deduplicator.document_simhash_deduplicator import \
+ DocumentSimhashDeduplicator
+
+
+class DocumentSimhashDeduplicatorTest(unittest.TestCase):
+
+    def _run_simhash_dedup(self, dataset: Dataset, target_list, op):
+        # Compute the simhash fingerprint for every sample, then let the op
+        # drop the samples it identifies as near-duplicates.
+        dataset = dataset.map(op.compute_hash)
+        dataset, _ = op.process(dataset)
+        # Compare only the 'text' column against the expected survivors.
+        dataset = dataset.select_columns(column_names=['text'])
+        res_list = dataset.to_list()
+        self.assertEqual(res_list, target_list)
+
+ def test_english_deduplication(self):
+ ds_list = [
+ {
+ 'text': 'Today is Sunday and it\'s a happy day!'
+ },
+ {
+ 'text': 'Do you need a cup of coffee?'
+ },
+ {
+ 'text': 'Today is sunday and it\'s really a happy day!'
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux Falls, '
+ 'South Dakota. The plant slaughters 19,500 pigs a day — 5 '
+ 'percent of U.S. pork. Most of the workers are immigrants '
+ 'from Ethiopia, Mexico, South Sudan, Honduras, Myanmar, '
+ 'Somalia, Guatemala, and other poor countries.\n\nInevitably '
+ 'workers must pass within one foot of hundreds of colleagues '
+ 'in the hallways, locker rooms, cafeterias, and cutting '
+ 'lines. The same conditions have spurred Covid-19 outbreaks '
+ 'at meat plants from Minnesota and Wisconsin to Colorado, '
+ 'Nebraska, Missouri, Iowa, Pennsylvania, North Carolina, and '
+ 'Georgia.\n\n801 workers at the Sioux Falls plant have tested '
+ 'positive, together with 206 people close to them. The '
+ 'outbreak has killed Agustín Rodríguez Martínez, aged 64, an '
+ 'employee with two decades of experience originally from El '
+ 'Salvador, and Craig Allen Franken, 61, who worked for '
+ 'Smithfield his entire adult life.\n\nThe company knew of its '
+ 'first infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting '
+ 'organized.\n\nThe major union in the industry, including at '
+ 'Smithfield, is the United Food and Commercial Workers union '
+ '(UFCW). What union leaders have done is ultimately '
+ 'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
+ 'Kooper Caraway has publicly said management delayed safety '
+ 'action as long as possible for profit. But while some '
+ 'workers were demanding a two-week shutdown, Caraway told '
+ 'the Argus Leader that was unrealistic because the '
+ 'government considers the plant essential. He suggested the '
+ 'union would be happy with minimal safety measures: “Even if '
+ '10 people get exposed in a day rather than 11. If you can '
+ 'implement a program where even one or two less people get '
+ 'exposed during a shift, that’s one or two less people.” Of '
+ 'course reducing infections is good, but suggesting workers '
+ 'would be satisfied if the company allowed 90% of the '
+ 'contagion to continue is horrifying.\n\nThe response of '
+ 'UFCW leadership was worse. As the disease was exploding, '
+ 'they told the Argus Leader, “We applaud [Smithfield’s] '
+ 'decision to temporarily close the plant [over Easter '
+ 'weekend] to push for an even safer work environment.” What '
+ 'does “even safer” mean in this context?\n\nThe union '
+ 'bureaucracy has taken weak action elsewhere. In '
+ 'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
+ 'months with Cargill Meat — the same pandemic premium Amazon '
+ 'gave workers without a union. In Nebraska, the UFCW '
+ 'negotiated $4 hazard pay for one month with meat giant '
+ 'JBS.\n\nThe union has said nothing about forcing companies '
+ 'to send older workers home with pay, even though a '
+ '70-year-old shop steward and a 78-year-old grandfather '
+ 'working at JBS plants were killed by Covid-19. Smithfield '
+ 'workers were promised only two weeks of shutdown pay. For '
+ 'many, this compensation is half their normal paycheck '
+ 'because they routinely put in 66 hour weeks — overtime that '
+ 'costs exhaustion and chronic pain.\n\nUnion officials '
+ 'endeavor to cooperate with the meat companies. An Iowa UFCW '
+ 'president actually suggested it might be impossible for '
+ 'plants to move workers a full six feet apart and told the '
+ 'Des Moines Register, “We can’t stop the plants. If we stop '
+ 'the plants from running, we stop feeding the country. We '
+ 'want to do everything we can to make sure the employees are '
+ 'safe to keep the plant running.”\n\nEvery part of this '
+ 'explanation directly overlaps with what the Smithfield CEO '
+ 'said. Unfortunately, it amounts to accepting the company’s '
+ 'excuses.\n\nThey claim that workers who do hard physical '
+ 'labor, waking up at 4 a.m. and often working six days a '
+ 'week for years, would be guilty of taking food away from '
+ 'the people and hurting America if they dared to fight for '
+ 'their human needs. But nothing is said about the company '
+ 'raking in profits and even murdering workers to increase '
+ 'them.\n\nSmithfield’s parent company W.H. Group, '
+ 'which slaughters around 30 million pigs per year in plants '
+ 'in both the United States and China, saw its profits '
+ 'skyrocket by about one third in 2019 to $1.38 billion. It '
+ 'is disturbing that UFCW officials do not bring up these '
+ 'soaring profits in their response to the outbreaks. Reuters '
+ 'published a report on the corporation’s financial success '
+ 'in late March. The head of W.H. Group had touted to the '
+ 'media that it got through the pandemic in China with very '
+ 'limited impact on production.\n\nIt is true that many '
+ 'Smithfield workers are reasonably afraid for their jobs and '
+ 'want to keep working. A 25-year-old employee explained, '
+ '“I have a lot of bills. My baby’s coming soon — I have to '
+ 'work.” At the same time, he was afraid of infecting his '
+ 'pregnant wife. His spouse, a former employee, '
+ 'said bitterly, “Smithfield— they don’t care about '
+ 'employees. They only care about their money.”\n\nWorkers '
+ 'are pressured in these two painful directions. Nonetheless, '
+ 'work can mean solidarity. Before Smithfield even checked '
+ 'temperatures, there was a “sick-out” strike without union '
+ 'support by 800 to 1,000 workers at a JBS meat factory in '
+ 'Colorado. Hundreds of workers also called in sick days at a '
+ 'Nebraska JBS plant.\n\nTrade union leaders won’t even '
+ 'whisper the word “strike” when thousands of workers are '
+ 'thinking about it. They are limiting themselves to polite '
+ 'requests. We need a workers’ movement that asks who '
+ 'controls the factory, that threatens to disrupt the bosses’ '
+ 'profits, and that allows workers to use their immense power '
+ '— this could change the meat industry and the world. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plants in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pig a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting '
+ 'organized.\n\nThe major union in the industry, including at '
+ 'Smithfield, is the United Food and Commercial Workers union '
+ '(UFCW). What union leaders have done is ultimately '
+ 'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
+ 'Kooper Caraway has publicly said management delayed safety '
+ 'action as long as possible for profit. But while some '
+ 'workers were demanding a two-week shutdown, Caraway told '
+ 'the Argus Leader that was unrealistic because the '
+ 'government considers the plant essential. He suggested the '
+ 'union would be happy with minimal safety measures: “Even if '
+ '10 people get exposed in a day rather than 11. If you can '
+ 'implement a program where even one or two less people get '
+ 'exposed during a shift, that’s one or two less people.” Of '
+ 'course reducing infections is good, but suggesting workers '
+ 'would be satisfied if the company allowed 90% of the '
+ 'contagion to continue is horrifying.\n\nThe response of '
+ 'UFCW leadership was worse. As the disease was exploding, '
+ 'they told the Argus Leader, “We applaud [Smithfield’s] '
+ 'decision to temporarily close the plant [over Easter '
+ 'weekend] to push for an even safer work environment.” What '
+ 'does “even safer” mean in this context?\n\nThe union '
+ 'bureaucracy has taken weak action elsewhere. In '
+ 'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
+ 'months with Cargill Meat — the same pandemic premium Amazon '
+ 'gave workers without a union. In Nebraska, the UFCW '
+ 'negotiated $4 hazard pay for one month with meat giant '
+ 'JBS.\n\nThe union has said nothing about forcing companies '
+ 'to send older workers home with pay, even though a '
+ '70-year-old shop steward and a 78-year-old grandfather '
+ 'working at JBS plants were killed by Covid-19. Smithfield '
+ 'workers were promised only two weeks of shutdown pay. For '
+ 'many, this compensation is half their normal paycheck '
+ 'because they routinely put in 66 hour weeks — overtime that '
+ 'costs exhaustion and chronic pain.\n\nUnion officials '
+ 'endeavor to cooperate with the meat companies. An Iowa UFCW '
+ 'president actually suggested it might be impossible for '
+ 'plants to move workers a full six feet apart and told the '
+ 'Des Moines Register, “We can’t stop the plants. If we stop '
+ 'the plants from running, we stop feeding the country. We '
+ 'want to do everything we can to make sure the employees are '
+ 'safe to keep the plant running.”\n\nEvery part of this '
+ 'explanation directly overlaps with what the Smithfield CEO '
+ 'said. Unfortunately, it amounts to accepting the company’s '
+ 'excuses.\n\nThey claim that workers who do hard physical '
+ 'labor, waking up at 4 a.m. and often working six days a '
+ 'week for years, would be guilty of taking food away from '
+ 'the people and hurting America if they dared to fight for '
+ 'their human needs. But nothing is said about the company '
+ 'raking in profits and even murdering workers to increase '
+ 'them.\n\nSmithfield’s parent company W.H. Group, '
+ 'which slaughters around 30 million pigs per year in plants '
+ 'in both the United States and China, saw its profits '
+ 'skyrocket by about one third in 2019 to $1.38 billion. It '
+ 'is disturbing that UFCW officials do not bring up these '
+ 'soaring profits in their response to the outbreaks. Reuters '
+ 'published a report on the corporation’s financial success '
+ 'in late March. The head of W.H. Group had touted to the '
+ 'media that it got through the pandemic in China with very '
+ 'limited impact on production.\n\nIt is true that many '
+ 'Smithfield workers are reasonably afraid for their jobs and '
+ 'want to keep working. A 25-year-old employee explained, '
+ '“I have a lot of bills. My baby’s coming soon — I have to '
+ 'work.” At the same time, he was afraid of infecting his '
+ 'pregnant wife. His spouse, a former employee, '
+ 'said bitterly, “Smithfield— they don’t care about '
+ 'employees. They only care about their money.”\n\nWorkers '
+ 'are pressured in these two painful directions. Nonetheless, '
+ 'work can mean solidarity. Before Smithfield even checked '
+ 'temperatures, there was a “sick-out” strike without union '
+ 'support by 800 to 1,000 workers at a JBS meat factory in '
+ 'Colorado. Hundreds of workers also called in sick days at a '
+ 'Nebraska JBS plant.\n\nTrade union leaders won’t even '
+ 'whisper the word “strike” when thousands of workers are '
+ 'thinking about it. They are limiting themselves to polite '
+ 'requests. We need a workers’ movement that asks who '
+ 'controls the factory, that threatens to disrupt the bosses’ '
+ 'profits, and that allows workers to use their immense power '
+ '— this could change the meat industry and the world. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting organized. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plants in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pig a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting organized. '
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ ]
+ tgt_list = [
+ {
+ 'text': 'Today is Sunday and it\'s a happy day!'
+ },
+ {
+ 'text': 'Do you need a cup of coffee?'
+ },
+ {
+ 'text': 'Today is sunday and it\'s really a happy day!'
+ },
+ {
+ 'text':
+ 'This paper proposed a novel method on LLM pretraining.'
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting '
+ 'organized.\n\nThe major union in the industry, including at '
+ 'Smithfield, is the United Food and Commercial Workers union '
+ '(UFCW). What union leaders have done is ultimately '
+ 'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
+ 'Kooper Caraway has publicly said management delayed safety '
+ 'action as long as possible for profit. But while some '
+ 'workers were demanding a two-week shutdown, Caraway told '
+ 'the Argus Leader that was unrealistic because the '
+ 'government considers the plant essential. He suggested the '
+ 'union would be happy with minimal safety measures: “Even if '
+ '10 people get exposed in a day rather than 11. If you can '
+ 'implement a program where even one or two less people get '
+ 'exposed during a shift, that’s one or two less people.” Of '
+ 'course reducing infections is good, but suggesting workers '
+ 'would be satisfied if the company allowed 90% of the '
+ 'contagion to continue is horrifying.\n\nThe response of '
+ 'UFCW leadership was worse. As the disease was exploding, '
+ 'they told the Argus Leader, “We applaud [Smithfield’s] '
+ 'decision to temporarily close the plant [over Easter '
+ 'weekend] to push for an even safer work environment.” What '
+ 'does “even safer” mean in this context?\n\nThe union '
+ 'bureaucracy has taken weak action elsewhere. In '
+ 'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
+ 'months with Cargill Meat — the same pandemic premium Amazon '
+ 'gave workers without a union. In Nebraska, the UFCW '
+ 'negotiated $4 hazard pay for one month with meat giant '
+ 'JBS.\n\nThe union has said nothing about forcing companies '
+ 'to send older workers home with pay, even though a '
+ '70-year-old shop steward and a 78-year-old grandfather '
+ 'working at JBS plants were killed by Covid-19. Smithfield '
+ 'workers were promised only two weeks of shutdown pay. For '
+ 'many, this compensation is half their normal paycheck '
+ 'because they routinely put in 66 hour weeks — overtime that '
+ 'costs exhaustion and chronic pain.\n\nUnion officials '
+ 'endeavor to cooperate with the meat companies. An Iowa UFCW '
+ 'president actually suggested it might be impossible for '
+ 'plants to move workers a full six feet apart and told the '
+ 'Des Moines Register, “We can’t stop the plants. If we stop '
+ 'the plants from running, we stop feeding the country. We '
+ 'want to do everything we can to make sure the employees are '
+ 'safe to keep the plant running.”\n\nEvery part of this '
+ 'explanation directly overlaps with what the Smithfield CEO '
+ 'said. Unfortunately, it amounts to accepting the company’s '
+ 'excuses.\n\nThey claim that workers who do hard physical '
+ 'labor, waking up at 4 a.m. and often working six days a '
+ 'week for years, would be guilty of taking food away from '
+ 'the people and hurting America if they dared to fight for '
+ 'their human needs. But nothing is said about the company '
+ 'raking in profits and even murdering workers to increase '
+ 'them.\n\nSmithfield’s parent company W.H. Group, '
+ 'which slaughters around 30 million pigs per year in plants '
+ 'in both the United States and China, saw its profits '
+ 'skyrocket by about one third in 2019 to $1.38 billion. It '
+ 'is disturbing that UFCW officials do not bring up these '
+ 'soaring profits in their response to the outbreaks. Reuters '
+ 'published a report on the corporation’s financial success '
+ 'in late March. The head of W.H. Group had touted to the '
+ 'media that it got through the pandemic in China with very '
+ 'limited impact on production.\n\nIt is true that many '
+ 'Smithfield workers are reasonably afraid for their jobs and '
+ 'want to keep working. A 25-year-old employee explained, '
+ '“I have a lot of bills. My baby’s coming soon — I have to '
+ 'work.” At the same time, he was afraid of infecting his '
+ 'pregnant wife. His spouse, a former employee, '
+ 'said bitterly, “Smithfield— they don’t care about '
+ 'employees. They only care about their money.”\n\nWorkers '
+ 'are pressured in these two painful directions. Nonetheless, '
+ 'work can mean solidarity. Before Smithfield even checked '
+ 'temperatures, there was a “sick-out” strike without union '
+ 'support by 800 to 1,000 workers at a JBS meat factory in '
+ 'Colorado. Hundreds of workers also called in sick days at a '
+ 'Nebraska JBS plant.\n\nTrade union leaders won’t even '
+ 'whisper the word “strike” when thousands of workers are '
+ 'thinking about it. They are limiting themselves to polite '
+ 'requests. We need a workers’ movement that asks who '
+ 'controls the factory, that threatens to disrupt the bosses’ '
+ 'profits, and that allows workers to use their immense power '
+ '— this could change the meat industry and the world. '
+ },
+ {
+ 'text':
+ 'Smithfield employs 3,700 people at its plant in Sioux '
+ 'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
+ '— 5 percent of U.S. pork. Most of the workers are '
+ 'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
+ 'Myanmar, Somalia, Guatemala, and other poor '
+ 'countries.\n\nInevitably workers must pass within one foot '
+ 'of hundreds of colleagues in the hallways, locker rooms, '
+ 'cafeterias, and cutting lines. The same conditions have '
+ 'spurred Covid-19 outbreaks at meat plants from Minnesota '
+ 'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
+ 'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
+ 'at the Sioux Falls plant have tested positive, together '
+ 'with 206 people close to them. The outbreak has killed '
+ 'Agustín Rodríguez Martínez, aged 64, an employee with two '
+ 'decades of experience originally from El Salvador, '
+ 'and Craig Allen Franken, 61, who worked for Smithfield his '
+ 'entire adult life.\n\nThe company knew of its first '
+ 'infection on March 24 or earlier. The virus spread '
+ 'exponentially for several weeks. Ahead of Easter Sunday and '
+ 'Monday (April 12-13), Smithfield promised to “completely '
+ 'shutter” to sanitize and put up cardboard and plastic sheet '
+ 'dividers. This would not end transmission, as potentially '
+ 'hundreds of staff were already carrying the virus. But even '
+ 'during this “shutdown,” many cars were seen in the parking '
+ 'lot. The mayor admits that the company lied, and the local '
+ 'AFL-CIO alleges the plant ran 60 percent production. On '
+ 'Easter, with 238 known infections, Smithfield finally '
+ 'agreed to shut down indefinitely after a request from the '
+ 'mayor and the governor. Yet the company insisted on waiting '
+ 'three more days to actually halt production.\n\nSmithfield '
+ 'denied contributing to the outbreak, saying it took a “very '
+ 'proactive approach.” Relying on racism, the company blamed '
+ 'workers for getting themselves sick. A spokesperson said '
+ 'the outbreak was so severe because of the plant’s “large '
+ 'immigrant population,” claming “Living circumstances in '
+ 'certain cultures are different than they are with your '
+ 'traditional American family.” They slandered the workers as '
+ 'dirty, ignorant, and untrustworthy with help from governor '
+ 'Kristi Noem, who claimed, “99 percent of what’s going on '
+ 'today wasn’t happening inside the facility. It was more at '
+ 'home, where these employees were going home and spreading '
+ 'some of the virus” by living too close together.\n\nOne '
+ 'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
+ 'from South Sudan, says, “With how we work on the line, '
+ 'I would say I got sick because of them not taking safety '
+ 'measures.” His job is “really, really close” to other '
+ 'workers chopping fresh-killed pigs. “The job is so heavy. '
+ 'You have to breathe so hard.”\n\nIn early March, '
+ 'union officials requested masks, overcoats, entrance '
+ 'checking for fevers, and less crowding in 500-capacity '
+ 'cafeterias. But Smithfield waited on most safety measures '
+ 'until early April. Only April 6 did they start checking for '
+ 'fevers. Instead of protective masks, they gave out beard '
+ 'nets.\n\nSmithfield concealed infections with a policy of '
+ 'informing only employees whose work stations were in the '
+ 'same area as a person who tested positive. The fact that '
+ 'workers are required to move around was willfully ignored. '
+ 'One worker who tested positive said, “I clearly would have '
+ 'gotten it at the factory. This week I have worked on three '
+ 'different floors. I’ve eaten in two different cafeterias … '
+ 'I’ve been walking through the whole place.” Employees from '
+ 'the eighth floor of the plant were quarantined, '
+ 'but everyone else was told to keep working.\n\nWhat Is '
+ 'Really Going On?\n\nAverage plant wages are around $16 an '
+ 'hour. Smithfield never raised them. Instead, they offered '
+ '$500 to employees who could go all of April without an '
+ 'unapproved day off. The company says their “Responsibility '
+ 'Bonuses” show their “immense gratefulness” to employees '
+ '“for their selfless sacrifices.”\n\nMeanwhile, the local '
+ 'Argus Leader wrote union members wanted essential-worker '
+ 'hazard pay, which “would be considered hourly compensation '
+ 'about 1.5 or two times their normal pay.” One worker said, '
+ '“I feel like they’re bribing us with [the bonus] to come to '
+ 'work sick. That’s how you know they don’t care.”\n\nBoth '
+ 'Sioux Falls workers killed by Covid-19 were in their '
+ 'sixties. It is unconscionable that they were still working. '
+ 'All meatpackers over 50 should be on paid leave. Agustín '
+ 'Rodríguez, 64, had a rough job sawing the legs off dead '
+ 'pigs. He mopped floors with a fever shortly before he was '
+ 'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
+ 'plant, he claimed, “We have continued to run our facilities '
+ 'for one reason: to sustain our nation’s food supply.” This '
+ 'is an effort to sweep Smithfield’s abuses under the rug, '
+ 'as if the company were operating for public benefit. This '
+ 'patriotic propaganda that all Americans are in it together '
+ 'is like a drug to keep workers from getting organized. '
+ },
+ ]
+ dataset = Dataset.from_list(ds_list)
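+        # \p{P} matches Unicode punctuation, so documents that differ only in
+        # punctuation are expected to be detected as near-duplicates here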
+ op = DocumentSimhashDeduplicator(ignore_pattern=r'\p{P}')
+ self._run_simhash_dedup(dataset, tgt_list, op)
+
+ def test_chinese_deduplication(self):
+ ds_list = [
+ {
+ 'text': '你好,请问你是谁'
+ },
+ {
+ 'text': '欢迎来到阿里巴巴!'
+ },
+ {
+ 'text':
+ '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1. 国际'
+ '海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,'
+ '以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员'
+ '会成员参加委员会会议的费用。\n2. 由于秘书长向会员国发出为该信托基金捐款'
+ '的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账'
+ '户。\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会'
+ '议的方式,包括审查是否可能从管理局行政预算中提供经费。\n4. 自愿信托基金'
+ '迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身'
+ '份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履'
+ '行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而'
+ '未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经'
+ '济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,'
+ '根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至'
+ '215 000美元。\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现'
+ '行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关'
+ '的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基'
+ '金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会'
+ '成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法'
+ '处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率'
+ '提供。 购买机票的所有安排均由联合国秘书处执行。\n6. 虽然已经设立了临时性'
+ '的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对'
+ '信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\n(a) '
+ '从管理局一般行政经费累计利息中拨出一定数额的经费;\n(b) 每年从上一年预算'
+ '未动用部分中拨出规定的数额;\n(c) 从先驱投资者基金利息中拨出规定的数额。'
+ '\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财'
+ '务委员会提出一份报告。\n附件\n资助来自发展中国家的法律和技术委员会以及财'
+ '务\n委员会成员出席会议的指示性费用(美元)\n成员\n机票\n机场\n费用\n金'
+ '斯敦每日生活\n津贴\n转机途中每日生活\n7日\n共计\n14日\n经济舱\n公务舱'
+ '\n7天=(8天每日生活\n津贴)\n14天= (15天每日生活津贴)\n商务舱\n法律和技'
+ '术委员会\n印度尼西亚\n(纽约)\n黎巴嫩\n巴基斯坦\n阿根廷\n喀麦隆\n墨西哥'
+ '\n巴西\n塞内加尔\n莫桑比克\n埃及(纽约)\n大韩民国\n印度\n斐济\n智利\n'
+ '中国\n纳米比亚\n小计\n财务委员会\n缅甸\n乌干达\n牙买加\n印度(纽约)\n尼'
+ '日利亚\n总计\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000'
+ '美元至215 000美元(四舍五入)。'
+ },
+ {
+ 'text':
+ '第九届会议\n时间:2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1. 国际'
+ '海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,'
+ '以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员'
+ '会成员参加委员会会议的费用。\n2. 由于秘书长向会员国发出为该信托基金捐款'
+ '的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账'
+ '户。\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会'
+ '议的方式,包括审查是否可能从管理局行政预算中提供经费。\n4. 自愿信托基金'
+ '迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身'
+ '份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履'
+ '行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而'
+ '未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经'
+ '济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,'
+ '根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至'
+ '215 000美元。\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现'
+ '行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关'
+ '的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基'
+ '金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会'
+ '成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法'
+ '处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率'
+ '提供。 购买机票的所有安排均由联合国秘书处执行。\n6. 虽然已经设立了临时性'
+ '的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对'
+ '信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\n(a) '
+ '从管理局一般行政经费累计利息中拨出一定数额的经费;\n(b) 每年从上一年预算'
+ '未动用部分中拨出规定的数额;\n(c) 从先驱投资者基金利息中拨出规定的数额。'
+ '\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财'
+ '务委员会提出一份报告。\n附件\n资助来自发展中国家的法律和技术委员会以及财'
+ '务\n委员会成员出席会议的指示性费用(美元)\n成员\n机票\n机场\n费用\n金'
+ '斯敦每日生活\n津贴\n转机途中每日生活\n7日\n共计\n14日\n经济舱\n公务舱'
+ '\n7天=(8天每日生活\n津贴)\n14天= (15天每日生活津贴)\n商务舱\n法律和技'
+ '术委员会\n印度尼西亚\n(纽约)\n黎巴嫩\n巴基斯坦\n阿根廷\n喀麦隆\n墨西哥'
+ '\n巴西\n塞内加尔\n莫桑比克\n埃及(纽约)\n大韩民国\n印度\n斐济\n智利\n'
+ '中国\n纳米比亚\n小计\n财务委员会\n缅甸\n乌干达\n牙买加\n印度(纽约)\n尼'
+ '日利亚\n总计\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000'
+ '美元至215 000美元(四舍五入)。'
+ },
+ ]
+ tgt_list = [
+ {
+ 'text': '你好,请问你是谁'
+ },
+ {
+ 'text': '欢迎来到阿里巴巴!'
+ },
+ {
+ 'text':
+ '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法'
+ '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1. 国际'
+ '海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,'
+ '以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员'
+ '会成员参加委员会会议的费用。\n2. 由于秘书长向会员国发出为该信托基金捐款'
+ '的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账'
+ '户。\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会'
+ '议的方式,包括审查是否可能从管理局行政预算中提供经费。\n4. 自愿信托基金'
+ '迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身'
+ '份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履'
+ '行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而'
+ '未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经'
+ '济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,'
+ '根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至'
+ '215 000美元。\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现'
+ '行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关'
+ '的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基'
+ '金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会'
+ '成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法'
+ '处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率'
+ '提供。 购买机票的所有安排均由联合国秘书处执行。\n6. 虽然已经设立了临时性'
+ '的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对'
+ '信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\n(a) '
+ '从管理局一般行政经费累计利息中拨出一定数额的经费;\n(b) 每年从上一年预算'
+ '未动用部分中拨出规定的数额;\n(c) 从先驱投资者基金利息中拨出规定的数额。'
+ '\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财'
+ '务委员会提出一份报告。\n附件\n资助来自发展中国家的法律和技术委员会以及财'
+ '务\n委员会成员出席会议的指示性费用(美元)\n成员\n机票\n机场\n费用\n金'
+ '斯敦每日生活\n津贴\n转机途中每日生活\n7日\n共计\n14日\n经济舱\n公务舱'
+ '\n7天=(8天每日生活\n津贴)\n14天= (15天每日生活津贴)\n商务舱\n法律和技'
+ '术委员会\n印度尼西亚\n(纽约)\n黎巴嫩\n巴基斯坦\n阿根廷\n喀麦隆\n墨西哥'
+ '\n巴西\n塞内加尔\n莫桑比克\n埃及(纽约)\n大韩民国\n印度\n斐济\n智利\n'
+ '中国\n纳米比亚\n小计\n财务委员会\n缅甸\n乌干达\n牙买加\n印度(纽约)\n尼'
+ '日利亚\n总计\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000'
+ '美元至215 000美元(四舍五入)。'
+ },
+ ]
+ dataset = Dataset.from_list(ds_list)
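+        # character-level tokenization is presumably a better fit for Chinese
+        # text, which has no space-delimited words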
+ op = DocumentSimhashDeduplicator(tokenization='character',
+ ignore_pattern=r'\p{P}')
+ self._run_simhash_dedup(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/__init__.py b/tests/ops/filter/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py
new file mode 100644
index 000000000..8dc1d6733
--- /dev/null
+++ b/tests/ops/filter/test_alphanumeric_filter.py
@@ -0,0 +1,83 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.alphanumeric_filter import AlphanumericFilter
+
+
+class AlphanumericFilterTest(unittest.TestCase):
+
+ def _run_alphanumeric_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ tgt_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ dataset = Dataset.from_list(ds_list)
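+        # keep samples whose alphanumeric-character ratio lies in [0.2, 0.9]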
+ op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9)
+ self._run_alphanumeric_filter(dataset, tgt_list, op)
+
+ def test_token_case(self):
+
+ ds_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ tgt_list = [{
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }]
+ dataset = Dataset.from_list(ds_list)
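+        # with tokenization=True the ratio is presumably measured against the
+        # token count rather than the character count, so it can exceed 1.0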
+ op = AlphanumericFilter(tokenization=True, min_ratio=1.5)
+ self._run_alphanumeric_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_average_line_length_filter.py b/tests/ops/filter/test_average_line_length_filter.py
new file mode 100644
index 000000000..fa1090f33
--- /dev/null
+++ b/tests/ops/filter/test_average_line_length_filter.py
@@ -0,0 +1,52 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.average_line_length_filter import \
+ AverageLineLengthFilter
+
+
+class AverageLineLengthFilterTest(unittest.TestCase):
+
+ def _run_average_line_length_filter(self, dataset: Dataset, target_list,
+ op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ tgt_list = [{
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ dataset = Dataset.from_list(ds_list)
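+        # keep samples whose average line length is between 10 and 20 characters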
+ op = AverageLineLengthFilter(min_len=10, max_len=20)
+ self._run_average_line_length_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_character_repetition_filter.py b/tests/ops/filter/test_character_repetition_filter.py
new file mode 100644
index 000000000..f1cf7a8db
--- /dev/null
+++ b/tests/ops/filter/test_character_repetition_filter.py
@@ -0,0 +1,50 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.character_repetition_filter import \
+ CharacterRepetitionFilter
+
+
+class CharacterRepetitionFilterTest(unittest.TestCase):
+
+ def _run_character_repetition_filter(self, dataset: Dataset, target_list,
+ op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text':
+ "Today is Sund Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'a v s e c s f e f g a a a a a a a a a a'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': '中文也是一个字算一个长度'
+ }]
+ tgt_list = [{
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': '中文也是一个字算一个长度'
+ }]
+ dataset = Dataset.from_list(ds_list)
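+        # drop samples whose character-level 10-gram repetition ratio exceeds 0.4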
+ op = CharacterRepetitionFilter(rep_len=10,
+ min_ratio=0.0,
+ max_ratio=0.4)
+ self._run_character_repetition_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_flagged_words_filter.py b/tests/ops/filter/test_flagged_words_filter.py
new file mode 100644
index 000000000..ce10c869e
--- /dev/null
+++ b/tests/ops/filter/test_flagged_words_filter.py
@@ -0,0 +1,76 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.flagged_words_filter import FlaggedWordFilter
+
+
+class FlaggedWordFilterTest(unittest.TestCase):
+
+ def _run_flagged_words_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_en_case(self):
+
+ ds_list = [{
+ 'text': 'Today is anal cumshot day'
+ }, {
+ 'text': 'Fuck you doggystyle!'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ tgt_list = [{
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = FlaggedWordFilter(lang='en', max_ratio=0.045)
+ self._run_flagged_words_filter(dataset, tgt_list, op)
+
+ def test_zh_case(self):
+
+ ds_list = [{
+ 'text': '你是个卖淫女'
+ }, {
+ 'text': '根据算子使用情况增量安装方案确定'
+ }, {
+ 'text': '去除字母、数字、下划线占比过低或过高的代码'
+ }, {
+ 'text': '基于前一步结果,除掉打飞机、三级片等敏感词'
+ }, {
+ 'text': '使用片段分词器对每个页面进行分词,使用语言模型计算每个段落的困惑度得分,由此过滤低质量文本'
+ }]
+ tgt_list = [{
+ 'text': '根据算子使用情况增量安装方案确定'
+ }, {
+ 'text': '去除字母、数字、下划线占比过低或过高的代码'
+ }, {
+ 'text': '使用片段分词器对每个页面进行分词,使用语言模型计算每个段落的困惑度得分,由此过滤低质量文本'
+ }]
+ dataset = Dataset.from_list(ds_list)
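+        # Chinese needs tokenization=True; use_words_aug presumably augments
+        # word matching for languages written without spaces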
+ op = FlaggedWordFilter(lang='zh',
+ tokenization=True,
+ max_ratio=0.045,
+ use_words_aug=True)
+ self._run_flagged_words_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_language_id_score_filter.py b/tests/ops/filter/test_language_id_score_filter.py
new file mode 100644
index 000000000..78edef8d6
--- /dev/null
+++ b/tests/ops/filter/test_language_id_score_filter.py
@@ -0,0 +1,113 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.language_id_score_filter import \
+ LanguageIDScoreFilter
+
+
+class LanguageIDScoreFilterTest(unittest.TestCase):
+
+ def _run_language_id_score_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_en_case(self):
+
+ ds_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ tgt_list = [{
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = LanguageIDScoreFilter(lang='en', min_score=0.8)
+ self._run_language_id_score_filter(dataset, tgt_list, op)
+
+ def test_zh_case(self):
+
+ ds_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': '我出生于2023年12月15日'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—'
+ }, {
+ 'text': '他的英文名字叫Harry Potter'
+ }, {
+ 'text': '这是一个测试'
+ }]
+ tgt_list = [{
+ 'text': '我出生于2023年12月15日'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—'
+ }, {
+ 'text': '他的英文名字叫Harry Potter'
+ }, {
+ 'text': '这是一个测试'
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = LanguageIDScoreFilter(lang='zh', min_score=0.8)
+ self._run_language_id_score_filter(dataset, tgt_list, op)
+
+ def test_none_case(self):
+
+ ds_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': '我出生于2023年12月15日'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—'
+ }, {
+ 'text': '他的英文名字叫Harry Potter'
+ }, {
+ 'text': '这是一个测试'
+ }]
+ tgt_list = [{
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': '我出生于2023年12月15日'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—'
+ }, {
+ 'text': '他的英文名字叫Harry Potter'
+ }, {
+ 'text': '这是一个测试'
+ }]
+ dataset = Dataset.from_list(ds_list)
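+        # with an empty lang, no target language is enforced; a sample is kept
+        # as long as the detector's confidence for its top language reaches 0.8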
+ op = LanguageIDScoreFilter(lang='', min_score=0.8)
+ self._run_language_id_score_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_maximum_line_length_filter.py b/tests/ops/filter/test_maximum_line_length_filter.py
new file mode 100644
index 000000000..aa50fa286
--- /dev/null
+++ b/tests/ops/filter/test_maximum_line_length_filter.py
@@ -0,0 +1,52 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.maximum_line_length_filter import \
+ MaximumLineLengthFilter
+
+
+class MaximumLineLengthFilterTest(unittest.TestCase):
+
+ def _run_maximum_line_length_filter(self, dataset: Dataset, target_list,
+ op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'a=1\nb\nc=1+2+3+5\nd=6'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sunda and it's a happy day!\nYou know"
+ }, {
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ tgt_list = [{
+ 'text': 'a v s e e f g a qkc'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231\n'
+ }]
+ dataset = Dataset.from_list(ds_list)
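+        # keep samples whose longest line is between 10 and 20 characters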
+ op = MaximumLineLengthFilter(min_len=10, max_len=20)
+ self._run_maximum_line_length_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_perplexity_filter.py b/tests/ops/filter/test_perplexity_filter.py
new file mode 100644
index 000000000..cc07992a4
--- /dev/null
+++ b/tests/ops/filter/test_perplexity_filter.py
@@ -0,0 +1,50 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.perplexity_filter import PerplexityFilter
+
+
+class PerplexityFilterTest(unittest.TestCase):
+
+ def _run_perplexity_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_en_case(self):
+
+ ds_list = [{
+ 'text': "Today is Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'a v s e c s f e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231'
+ }]
+ tgt_list = [{
+ 'text': "Today is Sunday and it's a happy day!"
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }]
+ dataset = Dataset.from_list(ds_list)
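+        # drop samples whose language-model perplexity exceeds max_ppl=900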
+ op = PerplexityFilter(lang='en', max_ppl=900)
+ self._run_perplexity_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_special_characters_filter.py b/tests/ops/filter/test_special_characters_filter.py
new file mode 100644
index 000000000..1ded47fef
--- /dev/null
+++ b/tests/ops/filter/test_special_characters_filter.py
@@ -0,0 +1,55 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.special_characters_filter import \
+ SpecialCharactersFilter
+
+
+class SpecialCharactersFilterTest(unittest.TestCase):
+
+ def _run_special_characters_filter(self, dataset: Dataset, target_list,
+ op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': "Today is Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'a v s e c s f e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }, {
+ 'text': 'emoji表情测试下😊,😸31231'
+ }]
+ tgt_list = [{
+ 'text': "Today is Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }]
+ dataset = Dataset.from_list(ds_list)
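+        # keep samples whose special-character ratio is at most 0.25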
+ op = SpecialCharactersFilter(min_ratio=0.0, max_ratio=0.25)
+ self._run_special_characters_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_specified_field_filter.py b/tests/ops/filter/test_specified_field_filter.py
new file mode 100644
index 000000000..99ef622e8
--- /dev/null
+++ b/tests/ops/filter/test_specified_field_filter.py
@@ -0,0 +1,147 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.specified_field_filter import SpecifiedFieldFilter
+
+
+class SpecifiedFieldFilterTest(unittest.TestCase):
+
+ def _run_specified_field_filter(self, dataset: Dataset, target_list, op):
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': 50
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx',
+ 'star': 6
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt',
+ 'star': 100
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '',
+ 'star': 12.51
+ }
+ }, {
+ 'text': 'dasdasdasdasdasdasdasd',
+ 'meta': {
+ 'suffix': None
+ }
+ }]
+ tgt_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': 50
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt',
+ 'star': 100
+ }
+ }]
+ dataset = Dataset.from_list(ds_list)
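+        # the dotted text_key addresses the nested field meta['suffix'];
+        # only samples whose suffix is '.pdf' or '.txt' are kept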
+ op = SpecifiedFieldFilter(text_key='meta.suffix',
+ target_value=['.pdf', '.txt'])
+ self._run_specified_field_filter(dataset, tgt_list, op)
+
+ def test_list_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': 50,
+ 'path': {
+ 'test': ['txt', 'json'],
+ 'test2': 'asadd'
+ }
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx',
+ 'star': 6,
+ 'path': {
+ 'test': ['pdf', 'txt', 'xbs'],
+ 'test2': ''
+ }
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt',
+ 'star': 100,
+ 'path': {
+ 'test': ['docx', '', 'html'],
+ 'test2': 'abcd'
+ }
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '',
+ 'star': 12.51,
+ 'path': {
+ 'test': ['json'],
+ 'test2': 'aasddddd'
+ }
+ }
+ }, {
+ 'text': 'dasdasdasdasdasdasdasd',
+ 'meta': {
+ 'suffix': None,
+ 'star': 333,
+ 'path': {
+ 'test': ['pdf', 'txt', 'json', 'docx'],
+ 'test2': None
+ }
+ }
+ }]
+ tgt_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': 50,
+ 'path': {
+ 'test': ['txt', 'json'],
+ 'test2': 'asadd'
+ }
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '',
+ 'star': 12.51,
+ 'path': {
+ 'test': ['json'],
+ 'test2': 'aasddddd'
+ }
+ }
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = SpecifiedFieldFilter(text_key='meta.path.test',
+ target_value=['pdf', 'txt', 'json'])
+ self._run_specified_field_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_specified_numeric_field_filter.py b/tests/ops/filter/test_specified_numeric_field_filter.py
new file mode 100644
index 000000000..813c47252
--- /dev/null
+++ b/tests/ops/filter/test_specified_numeric_field_filter.py
@@ -0,0 +1,204 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.specified_numeric_field_filter import \
+ SpecifiedNumericFieldFilter
+
+
+class SpecifiedNumericFieldFilterTest(unittest.TestCase):
+
+ def _run_specified_numeric_field_filter(self, dataset: Dataset,
+ target_list, op):
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': 50
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx',
+ 'star': 6
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt',
+ 'star': 100
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '.html',
+ 'star': 12.51
+ }
+ }, {
+ 'text': 'dasdasdasdasdasdasdasd',
+ 'meta': {
+ 'suffix': None
+ }
+ }]
+ tgt_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': 50
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '.html',
+ 'star': 12.51
+ }
+ }]
+ dataset = Dataset.from_list(ds_list)
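+        # keep samples whose nested numeric field meta.star lies in [10, 70];
+        # the sample without a star field is expected to be dropped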
+ op = SpecifiedNumericFieldFilter(text_key='meta.star',
+ min_value=10,
+ max_value=70)
+ self._run_specified_numeric_field_filter(dataset, tgt_list, op)
+
+ def test_multi_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun',
+ 'count': 101,
+ 'meta': {
+ 'suffix': '.pdf',
+ 'key1': {
+ 'key2': {
+ 'count': 34
+ },
+ 'count': 5
+ }
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'count': 16,
+ 'meta': {
+ 'suffix': '.docx',
+ 'key1': {
+ 'key2': {
+ 'count': 243
+ },
+ 'count': 63
+ }
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'count': 162,
+ 'meta': {
+ 'suffix': '.txt',
+ 'key1': {
+ 'key2': {
+ 'count': None
+ },
+ 'count': 23
+ }
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'count': None,
+ 'meta': {
+ 'suffix': '.html',
+ 'key1': {
+ 'key2': {
+ 'count': 18
+ },
+ 'count': 48
+ }
+ }
+ }]
+ tgt_list = [{
+ 'text': 'Today is Sun',
+ 'count': 101,
+ 'meta': {
+ 'suffix': '.pdf',
+ 'key1': {
+ 'key2': {
+ 'count': 34
+ },
+ 'count': 5
+ }
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'count': None,
+ 'meta': {
+ 'suffix': '.html',
+ 'key1': {
+ 'key2': {
+ 'count': 18
+ },
+ 'count': 48
+ }
+ }
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = SpecifiedNumericFieldFilter(text_key='meta.key1.key2.count',
+ min_value=10,
+ max_value=70)
+ self._run_specified_numeric_field_filter(dataset, tgt_list, op)
+
+ def test_str_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': '36'
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx',
+ 'star': '13.5'
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt',
+ 'star': 'asdkc'
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '.html',
+ 'star': '441'
+ }
+ }, {
+ 'text': 'dasdasdasdasdasdasdasd',
+ 'meta': {
+ 'suffix': None
+ }
+ }]
+ tgt_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf',
+ 'star': '36'
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx',
+ 'star': '13.5'
+ }
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = SpecifiedNumericFieldFilter(text_key='meta.star',
+ min_value=10,
+ max_value=70)
+ self._run_specified_numeric_field_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_stop_words_filter.py b/tests/ops/filter/test_stop_words_filter.py
new file mode 100644
index 000000000..5ffcbc3ca
--- /dev/null
+++ b/tests/ops/filter/test_stop_words_filter.py
@@ -0,0 +1,76 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.stopwords_filter import StopWordsFilter
+
+
+class StopWordsFilterTest(unittest.TestCase):
+
+ def _run_stopwords_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_en_case(self):
+
+ ds_list = [{
+ 'text': "Today is Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'a v s e c s f e f g a qkc'
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }]
+ tgt_list = [{
+ 'text': "Today is Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'Do you need a cup of coffee?'
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = StopWordsFilter(lang='en', min_ratio=0.3)
+ self._run_stopwords_filter(dataset, tgt_list, op)
+
+ def test_zh_case(self):
+
+ ds_list = [{
+ 'text': '你好,请问你是谁'
+ }, {
+ 'text': '字母、数字、下划线、占比、代码'
+ }, {
+ 'text': '基于前一步结果,在同一个聚类中找出那些过长文档为假正例,暂不进行滤除'
+ }, {
+ 'text': '使用片段分词器对每个页面进行分词,使用语言模型计算每个段落的困惑度得分,由此过滤低质量文本'
+ }]
+ tgt_list = [{
+ 'text': '你好,请问你是谁'
+ }, {
+ 'text': '基于前一步结果,在同一个聚类中找出那些过长文档为假正例,暂不进行滤除'
+ }, {
+ 'text': '使用片段分词器对每个页面进行分词,使用语言模型计算每个段落的困惑度得分,由此过滤低质量文本'
+ }]
+ dataset = Dataset.from_list(ds_list)
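+        # keep tokenized Chinese samples whose stopword ratio reaches 0.2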
+ op = StopWordsFilter(lang='zh',
+ tokenization=True,
+ min_ratio=0.2,
+ use_words_aug=True)
+ self._run_stopwords_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_suffix_filter.py b/tests/ops/filter/test_suffix_filter.py
new file mode 100644
index 000000000..8d90d1885
--- /dev/null
+++ b/tests/ops/filter/test_suffix_filter.py
@@ -0,0 +1,119 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.suffix_filter import SuffixFilter
+
+
+class SuffixFilterTest(unittest.TestCase):
+
+ def _run_suffix_filter(self, dataset: Dataset, target_list, op):
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf'
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx'
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt'
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '.html'
+ }
+ }, {
+ 'text': 'dasdasdasdasdasdasdasd',
+ 'meta': {
+ 'suffix': '.py'
+ }
+ }]
+ tgt_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf'
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt'
+ }
+ }]
+ dataset = Dataset.from_list(ds_list)
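+        # keep only samples whose meta.suffix is '.txt' or '.pdf'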
+ op = SuffixFilter(suffixes=['.txt', '.pdf'])
+ self._run_suffix_filter(dataset, tgt_list, op)
+
+ def test_none_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf'
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx'
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt'
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '.html'
+ }
+ }, {
+ 'text': 'dasdasdasdasdasdasdasd',
+ 'meta': {
+ 'suffix': '.py'
+ }
+ }]
+ tgt_list = [{
+ 'text': 'Today is Sun',
+ 'meta': {
+ 'suffix': '.pdf'
+ }
+ }, {
+ 'text': 'a v s e c s f e f g a a a ',
+ 'meta': {
+ 'suffix': '.docx'
+ }
+ }, {
+ 'text': '中文也是一个字算一个长度',
+ 'meta': {
+ 'suffix': '.txt'
+ }
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!',
+ 'meta': {
+ 'suffix': '.html'
+ }
+ }, {
+ 'text': 'dasdasdasdasdasdasdasd',
+ 'meta': {
+ 'suffix': '.py'
+ }
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = SuffixFilter()
+ self._run_suffix_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_text_length_filter.py b/tests/ops/filter/test_text_length_filter.py
new file mode 100644
index 000000000..fea4f25aa
--- /dev/null
+++ b/tests/ops/filter/test_text_length_filter.py
@@ -0,0 +1,50 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.text_length_filter import TextLengthFilter
+
+
+class TextLengthFilterTest(unittest.TestCase):
+
+ def _run_text_length_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'Today is'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'a v s e c s f e f g a a a '
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': '中文也是一个字算一个长度'
+ }]
+ tgt_list = [{
+ 'text': 'a v s e c s f e f g a a a '
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }, {
+ 'text': '中文也是一个字算一个长度'
+ }]
+ dataset = Dataset.from_list(ds_list)
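+        # keep samples whose text length is between 10 and 50 characters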
+ op = TextLengthFilter(min_len=10, max_len=50)
+ self._run_text_length_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_word_num_filter.py b/tests/ops/filter/test_word_num_filter.py
new file mode 100644
index 000000000..e7ee02415
--- /dev/null
+++ b/tests/ops/filter/test_word_num_filter.py
@@ -0,0 +1,74 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.word_num_filter import WordNumFilter
+
+
+class WordNumFilterTest(unittest.TestCase):
+
+ def _run_word_num_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_case(self):
+
+ ds_list = [{
+ 'text': 'Today is Sun'
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'a v s e c s f e f g a a a '
+ }, {
+ 'text': ',。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►'
+ }]
+ tgt_list = [{
+ 'text':
+ "Today is Sund Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text': 'a v s e c s f e f g a a a '
+ }]
+ dataset = Dataset.from_list(ds_list)
+ op = WordNumFilter(min_num=5, max_num=15)
+ self._run_word_num_filter(dataset, tgt_list, op)
+
+ def test_zh_case(self):
+
+ ds_list = [{
+ 'text': '你好,请问你是谁'
+ }, {
+ 'text': '欢迎来到阿里巴巴'
+ }, {
+ 'text': '根据算子使用情况增量安装方案确定'
+ }, {
+ 'text': '去除字母、数字、下划线占比过低或过高的代码'
+ }, {
+ 'text': '使用片段分词器对每个页面进行分词,使用语言模型计算每个段落的困惑度得分,由此过滤低质量文本'
+ }, {
+ 'text': '基于前一步结果,在同一个聚类中找出那些过长文档为假正例,暂不进行滤除'
+ }]
+ tgt_list = [{
+ 'text': '去除字母、数字、下划线占比过低或过高的代码'
+ }, {
+ 'text': '基于前一步结果,在同一个聚类中找出那些过长文档为假正例,暂不进行滤除'
+ }]
+ dataset = Dataset.from_list(ds_list)
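+        # Chinese word counting needs tokenization=True; keep samples with
+        # 10 to 25 words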
+ op = WordNumFilter(lang='zh',
+ tokenization=True,
+ min_num=10,
+ max_num=25)
+ self._run_word_num_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/filter/test_word_repetition_filter.py b/tests/ops/filter/test_word_repetition_filter.py
new file mode 100644
index 000000000..8678ff5b6
--- /dev/null
+++ b/tests/ops/filter/test_word_repetition_filter.py
@@ -0,0 +1,85 @@
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.filter.word_repetition_filter import WordRepetitionFilter
+
+
+class WordRepetitionFilterTest(unittest.TestCase):
+
+ def _run_word_repetition_filter(self, dataset: Dataset, target_list, op):
+ if 'stats' not in dataset.features:
+ # TODO:
+ # this is a temp solution,
+ # only add stats when calling filter op
+ dataset = dataset.add_column(name='stats',
+ column=[{}] * dataset.num_rows)
+ dataset = dataset.map(op.compute_stats)
+ dataset = dataset.filter(op.process)
+ dataset = dataset.select_columns(column_names=['text'])
+ res_list = dataset.to_list()
+ self.assertEqual(res_list, target_list)
+
+ def test_en_case(self):
+
+ ds_list = [{
+ 'text':
+ "Today is Sunday Sunday Sunday Sunday Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "Today is Sunday Sunday Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "Today is Sund Sund Sund Sund Sund Sunda and it's a happy day!"
+ }, {
+ 'text':
+ "plusieurs èrdash@hqbchd.ckd d'accéder à ces wwwasdasd fonc"
+ }, {
+ 'text':
+ 'This proposed a novel proposed pretraining proposed pretraining.'
+ }]
+ tgt_list = [{
+ 'text':
+ "Today is Sunday Sunday Sunday and it's a happy day!"
+ }, {
+ 'text':
+ "plusieurs èrdash@hqbchd.ckd d'accéder à ces wwwasdasd fonc"
+ }, {
+ 'text':
+ 'This proposed a novel proposed pretraining proposed pretraining.'
+ }]
+ dataset = Dataset.from_list(ds_list)
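+        # drop samples whose word-level 3-gram repetition ratio exceeds 0.2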
+ op = WordRepetitionFilter(rep_len=3, min_ratio=0.0, max_ratio=0.2)
+ self._run_word_repetition_filter(dataset, tgt_list, op)
+
+ def test_zh_case(self):
+
+ ds_list = [{
+ 'text': '去除字母、数字、下划线占比过低或过高的代码'
+ }, {
+ 'text': '欢迎来到阿里巴巴巴巴巴巴巴巴'
+ }, {
+ 'text': '使用片段分词器对每个页面进行分词,使用语言模型计算每个段落的困惑度得分'
+ }, {
+ 'text': '根据算子使用使用使用使用安装方案确定'
+ }, {
+ 'text': '基于前一步结果,在同一个聚类中找出那些过长文档为假正例,暂不进行滤除'
+ }]
+ tgt_list = [{
+ 'text': '去除字母、数字、下划线占比过低或过高的代码'
+ }, {
+ 'text': '使用片段分词器对每个页面进行分词,使用语言模型计算每个段落的困惑度得分'
+ }, {
+ 'text': '基于前一步结果,在同一个聚类中找出那些过长文档为假正例,暂不进行滤除'
+ }]
+ dataset = Dataset.from_list(ds_list)
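+        # same check on tokenized Chinese text: heavily repeated phrases such
+        # as '巴巴巴巴' or '使用使用使用' are expected to be filtered out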
+ op = WordRepetitionFilter(lang='zh',
+ tokenization=True,
+ rep_len=3,
+ min_ratio=0.0,
+ max_ratio=0.2)
+ self._run_word_repetition_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/mapper/__init__.py b/tests/ops/mapper/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/ops/mapper/test_clean_copyright_mapper.py b/tests/ops/mapper/test_clean_copyright_mapper.py
new file mode 100644
index 000000000..302942d26
--- /dev/null
+++ b/tests/ops/mapper/test_clean_copyright_mapper.py
@@ -0,0 +1,41 @@
+import unittest
+
+from data_juicer.ops.mapper.clean_copyright_mapper import CleanCopyrightMapper
+
+
+class CleanCopyrightMapperTest(unittest.TestCase):
+
+ def setUp(self):
+ self.op = CleanCopyrightMapper()
+
+ def _run_clean_copyright(self, samples):
+ for sample in samples:
+ result = self.op.process(sample)
+ self.assertEqual(result['text'], result['target'])
+
+ def test_clean_copyright(self):
+
+ samples = [{
+ 'text': '这是一段 /* 多行注释\n注释内容copyright\n*/ 的文本。另外还有一些 // 单行注释。',
+ 'target': '这是一段 的文本。另外还有一些 // 单行注释。'
+ }, {
+ 'text': '如果多行/*注释中没有\n关键词,那么\n这部分注释也不会\n被清除*/\n会保留下来',
+ 'target': '如果多行/*注释中没有\n关键词,那么\n这部分注释也不会\n被清除*/\n会保留下来'
+ }, {
+ 'text': '//if start with\n//that will be cleand \n envenly',
+ 'target': ' envenly'
+ }, {
+ 'text': 'http://www.nasosnsncc.com',
+ 'target': 'http://www.nasosnsncc.com'
+ }, {
+ 'text': '#if start with\nthat will be cleand \n#envenly',
+ 'target': 'that will be cleand \n#envenly'
+ }, {
+ 'text': '--if start with\n--that will be cleand \n#envenly',
+ 'target': ''
+ }]
+ self._run_clean_copyright(samples)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/mapper/test_clean_email_mapper.py b/tests/ops/mapper/test_clean_email_mapper.py
new file mode 100644
index 000000000..2b3d0fa80
--- /dev/null
+++ b/tests/ops/mapper/test_clean_email_mapper.py
@@ -0,0 +1,35 @@
+import unittest
+
+from data_juicer.ops.mapper.clean_email_mapper import CleanEmailMapper
+
+
+class CleanEmailMapperTest(unittest.TestCase):
+
+ def setUp(self):
+ self.op = CleanEmailMapper()
+
+ def _run_clean_email(self, samples):
+ for sample in samples:
+ result = self.op.process(sample)
+ self.assertEqual(result['text'], result['target'])
+
+ def test_clean_email(self):
+
+ samples = [{
+ 'text': 'happy day euqdh@cjqi.com',
+ 'target': 'happy day '
+ }, {
+ 'text': '请问你是谁dasoidhao@1264fg.45om',
+ 'target': '请问你是谁dasoidhao@1264fg.45om'
+ }, {
+ 'text': 'ftp://examplema-nièrdash@hqbchd.ckdhnfes.cds',
+ 'target': 'ftp://examplema-niè'
+ }, {
+ 'text': '👊23da44sh12@46hqb12chd.ckdhnfes.comd.dasd.asd.dc',
+ 'target': '👊'
+ }]
+ self._run_clean_email(samples)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ops/mapper/test_clean_html_mapper.py b/tests/ops/mapper/test_clean_html_mapper.py
new file mode 100644
index 000000000..ecab4114d
--- /dev/null
+++ b/tests/ops/mapper/test_clean_html_mapper.py
@@ -0,0 +1,212 @@
+import unittest
+
+from data_juicer.ops.mapper.clean_html_mapper import CleanHtmlMapper
+
+
+class CleanHtmlMapperTest(unittest.TestCase):
+
+ def setUp(self):
+ self.op = CleanHtmlMapper()
+
+ def _run_helper(self, samples):
+ for sample in samples:
+ result = self.op.process(sample)
+ self.assertEqual(result['text'], result['target'])
+
+ def test_complete_html_text(self):
+
+ samples = [
+ {
+ 'text':
+ ''
+                '<h1>Welcome to My Website</h1>'
+                '<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
+ '