Skip to content

Commit

Permalink
updating examples
Browse files Browse the repository at this point in the history
Signed-off-by: AntonEliatra <[email protected]>
  • Loading branch information
AntonEliatra committed Aug 9, 2024
1 parent 995516c commit 47cff69
Showing 1 changed file with 68 additions and 40 deletions.
108 changes: 68 additions & 40 deletions _analyzers/token-filters/cjk-bigram.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,23 @@ This option, when set to `true`, outputs both unigrams (single characters) and b
The following example request creates a new index named `cjk_bigram_example` and defines an analyzer that uses a `cjk_bigram` filter with the `ignored_scripts` parameter set to `katakana`:

```json
PUT /devanagari_example_index
PUT /cjk_bigram_example
{
"settings": {
"analysis": {
"filter": {
"cjk_bigram_filter": {
"type": "cjk_bigram",
"ignore_scripts": ["deva"],
"output_unigrams": false
}
},
"analyzer": {
"cjk_deva_analyzer": {
"type": "custom",
"cjk_bigrams_no_katakana": {
"tokenizer": "standard",
"filter": [
"lowercase",
"cjk_bigram_filter"
]
"filter": [ "cjk_bigrams_no_katakana_filter" ]
}
},
"filter": {
"cjk_bigrams_no_katakana_filter": {
"type": "cjk_bigram",
"ignored_scripts": [
"katakana"
],
"output_unigrams": true
}
}
}
Expand All @@ -80,59 +78,89 @@ PUT /devanagari_example_index
Use the following request to examine the tokens generated using the created analyzer:

```json
POST /devanagari_example_index/_analyze
POST /cjk_bigram_example/_analyze
{
"analyzer": "cjk_deva_analyzer",
"text": "यह एक उदाहरण है 中文文本" // Devanagari text followed by Chinese
"analyzer": "cjk_bigrams_no_katakana",
"text": "東京タワーに行く"
}
```
{% include copy-curl.html %}

Sample Text: "東京タワーに行く"

東京 (Kanji for "Tokyo")
タワー (Katakana for "Tower")
に行く (Hiragana and Kanji for "go to")

The response contains the generated tokens:

```json
{
"tokens": [
{
"token": "यह",
"token": "東",
"start_offset": 0,
"end_offset": 3,
"type": "<ALPHANUM>",
"end_offset": 1,
"type": "<SINGLE>",
"position": 0
},
{
"token": "एक",
"start_offset": 4,
"end_offset": 7,
"type": "<ALPHANUM>",
"token": "東京",
"start_offset": 0,
"end_offset": 2,
"type": "<DOUBLE>",
"position": 0,
"positionLength": 2
},
{
"token": "京",
"start_offset": 1,
"end_offset": 2,
"type": "<SINGLE>",
"position": 1
},
{
"token": "उदाहरण",
"start_offset": 8,
"end_offset": 16,
"type": "<ALPHANUM>",
"token": "タワー",
"start_offset": 2,
"end_offset": 5,
"type": "<KATAKANA>",
"position": 2
},
{
"token": "है",
"start_offset": 17,
"end_offset": 19,
"type": "<ALPHANUM>",
"token": "に",
"start_offset": 5,
"end_offset": 6,
"type": "<SINGLE>",
"position": 3
},
{
"token": "中文",
"start_offset": 20,
"end_offset": 22,
"type": "<ALPHANUM>",
"token": "に行",
"start_offset": 5,
"end_offset": 7,
"type": "<DOUBLE>",
"position": 3,
"positionLength": 2
},
{
"token": "行",
"start_offset": 6,
"end_offset": 7,
"type": "<SINGLE>",
"position": 4
},
{
"token": "文本",
"start_offset": 22,
"end_offset": 24,
"type": "<ALPHANUM>",
"token": "行く",
"start_offset": 6,
"end_offset": 8,
"type": "<DOUBLE>",
"position": 4,
"positionLength": 2
},
{
"token": "く",
"start_offset": 7,
"end_offset": 8,
"type": "<SINGLE>",
"position": 5
}
]
Expand Down

0 comments on commit 47cff69

Please sign in to comment.