diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..183b543 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +build/ +.gradle/ +.idea/ +.git/ diff --git a/src/main/java/org/elasticsearch/analysis/JasoDecomposer.java b/src/main/java/org/elasticsearch/analysis/JasoDecomposer.java index 7711d8d..5a18a6b 100755 --- a/src/main/java/org/elasticsearch/analysis/JasoDecomposer.java +++ b/src/main/java/org/elasticsearch/analysis/JasoDecomposer.java @@ -25,8 +25,7 @@ public String runJasoDecompose(String originStr, TokenizerOptions options) { if (!originStr.isEmpty()) { - //공백, lowercase 처리 - originStr = originStr.replace(" ", ""); + //lowercase 처리 originStr = originStr.toLowerCase(); char[] termBuffer = originStr.toCharArray(); @@ -201,16 +200,44 @@ public String runJasoDecompose(String originStr, TokenizerOptions options) { } //결과 조합 + + //공백을 붙인 전체 문자열 (한글) + if (korBuffer.indexOf(" ") != -1) { + if (korBuffer.length() > 0) { + returnBuffer.append(korBuffer.toString().replaceAll(" ", "")); + returnBuffer.append(" "); + } + } + + //공백으로 분리된 문자열 (한글) if (korBuffer.length() > 0) { returnBuffer.append(korBuffer.toString()); returnBuffer.append(" "); } + //공백을 붙인 전체 문자열 (영문) + if (engBuffer.indexOf(" ") != -1) { + if (engBuffer.length() > 0) { + returnBuffer.append(engBuffer.toString().replaceAll(" ", "")); + returnBuffer.append(" "); + } + } + + //공백으로 분리된 문자열 (영문) if (engBuffer.length() > 0) { returnBuffer.append(engBuffer.toString()); returnBuffer.append(" "); } + //공백을 붙인 전체 문자열 (오타) + if (mistypingBuffer.indexOf(" ") != -1) { + if (mistypingBuffer.length() > 0) { + returnBuffer.append(mistypingBuffer.toString().replaceAll(" ", "")); + returnBuffer.append(" "); + } + } + + //공백으로 분리된 문자열 (오타) if (mistypingBuffer.length() > 0) { returnBuffer.append(mistypingBuffer.toString()); returnBuffer.append(" "); diff --git a/src/test/java/org/elasticsearch/analysis/JasoTokenizerTest.java b/src/test/java/org/elasticsearch/analysis/JasoTokenizerTest.java index d9812a6..02a4c85 100755 --- a/src/test/java/org/elasticsearch/analysis/JasoTokenizerTest.java +++ b/src/test/java/org/elasticsearch/analysis/JasoTokenizerTest.java @@ -23,10 +23,10 @@ public void testTokenizer() throws IOException { TokenizerOptions options = TokenizerOptions.create("testTokenizer"); //한영오타에 대한 토큰 추출여부 (hello -> ㅗㄷㅣㅣㅐ, 최일규 -> chldlfrb) - options.setMistype(true); + options.setMistype(false); //초성검색을 위한 토큰 추출여부 (최일규 -> ㅊㅇㄱ) - options.setChosung(true); + options.setChosung(false); List testCase = new ArrayList(); @@ -36,23 +36,9 @@ public void testTokenizer() throws IOException { testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo/ㅅㄴㅅㄷ")); testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ")); testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~")); - testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/ㅁㅈㄱㅎㅍㅇㄷ")); - testCase.add(new TestCaseVO("소원을 말해봐 genie mixversion##!", "ㅅㅗㅇㅜㅓㄴㅇㅡㄹㅁㅏㄹㅎㅐㅂㅗㅏgeniemixversion##!/thdnjsdmfakfgoqhkgeniemixversion##!/ㅅㅇㅇㅁㅎㅂ")); - testCase.add(new TestCaseVO("hush hush; hush hush", "hushhush;hushhush/ㅗㅕㄴㅗㅗㅕㄴㅗ;ㅗㅕㄴㅗㅗㅕㄴㅗ")); - testCase.add(new TestCaseVO("라벨:모음곡(거울)제4곡(어릿광대의 아침노래)(서현)", "ㄹㅏㅂㅔㄹ:ㅁㅗㅇㅡㅁㄱㅗㄱ(ㄱㅓㅇㅜㄹ)ㅈㅔ4ㄱㅗㄱ(ㅇㅓㄹㅣㅅㄱㅗㅏㅇㄷㅐㅇㅡㅣㅇㅏㅊㅣㅁㄴㅗㄹㅐ)(ㅅㅓㅎㅕㄴ)/fkqpf:ahdmarhr(rjdnf)wp4rhr(djfltrhkdeodmldkclashfo)(tjgus)/ㄹㅂㅁㅇㄱㄱㅇㅈㄱㅇㄹㄱㄷㅇㅇㅊㄴㄹㅅㅎ")); - testCase.add(new TestCaseVO("ㅗ디ㅣㅐ", "ㅗㄷㅣㅣㅐ")); //hello - testCase.add(new TestCaseVO("째깅", "ㅈㅈㅐㄱㅣㅇ/world/ㅈㅈㄱ")); //World - testCase.add(new TestCaseVO("ㅣㅐ햣ㄷ초", "ㅣㅐㅎㅑㅅㄷㅊㅗ")); //logitech - testCase.add(new TestCaseVO("퍗므ㅑㅜ", "ㅍㅑㅅㅁㅡㅑㅜ")); //vitamin - testCase.add(new TestCaseVO("랱", "ㄹㅐㅌ/fox")); //fox - testCase.add(new TestCaseVO("ㅅ딛퍄냐ㅐㅜ", "ㅅㄷㅣㄷㅍㅑㄴㅑㅐㅜ")); //television - testCase.add(new TestCaseVO("최", "ㅊㅗㅣ/chl")); //television - testCase.add(new TestCaseVO("ㅅㄴㅅㄷ", "ㅅㄴㅅㄷ")); //소녀시대 초성검색 - testCase.add(new TestCaseVO("##%@#$%()*&^%$#@!", "##%@#$%()*&^%$#@!")); - testCase.add(new TestCaseVO("2000 최신가요", "2000ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ/2000chltlsrkdy")); - testCase.add(new TestCaseVO("15&", "15&")); - testCase.add(new TestCaseVO("최신가요 1990", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ1990/chltlsrkdy1990/ㅊㅅㄱㅇ")); - testCase.add(new TestCaseVO("최신가요&", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ&/chltlsrkdy&/ㅊㅅㄱㅇ")); + testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld/ㅁㅈㄱㅎㅍㅇㄷ")); + testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk/ㅇㄷㄷㅅㅇㄷㅎ")); + testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk/ㅌㄷㅇㅌㄱ")); } else if (options.isMistype() == true && options.isChosung() == false) { @@ -60,23 +46,9 @@ public void testTokenizer() throws IOException { testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo")); testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ")); testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~")); - testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld")); - testCase.add(new TestCaseVO("소원을 말해봐 genie mixversion##!", "ㅅㅗㅇㅜㅓㄴㅇㅡㄹㅁㅏㄹㅎㅐㅂㅗㅏgeniemixversion##!/thdnjsdmfakfgoqhkgeniemixversion##!")); - testCase.add(new TestCaseVO("hush hush; hush hush", "hushhush;hushhush/ㅗㅕㄴㅗㅗㅕㄴㅗ;ㅗㅕㄴㅗㅗㅕㄴㅗ")); - testCase.add(new TestCaseVO("라벨:모음곡(거울)제4곡(어릿광대의 아침노래)(서현)", "ㄹㅏㅂㅔㄹ:ㅁㅗㅇㅡㅁㄱㅗㄱ(ㄱㅓㅇㅜㄹ)ㅈㅔ4ㄱㅗㄱ(ㅇㅓㄹㅣㅅㄱㅗㅏㅇㄷㅐㅇㅡㅣㅇㅏㅊㅣㅁㄴㅗㄹㅐ)(ㅅㅓㅎㅕㄴ)/fkqpf:ahdmarhr(rjdnf)wp4rhr(djfltrhkdeodmldkclashfo)(tjgus)")); - testCase.add(new TestCaseVO("ㅗ디ㅣㅐ", "ㅗㄷㅣㅣㅐ")); //hello - testCase.add(new TestCaseVO("째깅", "ㅈㅈㅐㄱㅣㅇ/world")); //World - testCase.add(new TestCaseVO("ㅣㅐ햣ㄷ초", "ㅣㅐㅎㅑㅅㄷㅊㅗ")); //logitech - testCase.add(new TestCaseVO("퍗므ㅑㅜ", "ㅍㅑㅅㅁㅡㅑㅜ")); //vitamin - testCase.add(new TestCaseVO("랱", "ㄹㅐㅌ/fox")); //fox - testCase.add(new TestCaseVO("ㅅ딛퍄냐ㅐㅜ", "ㅅㄷㅣㄷㅍㅑㄴㅑㅐㅜ")); //television - testCase.add(new TestCaseVO("최", "ㅊㅗㅣ/chl")); //television - testCase.add(new TestCaseVO("ㅅㄴㅅㄷ", "ㅅㄴㅅㄷ")); //소녀시대 초성검색 - testCase.add(new TestCaseVO("##%@#$%()*&^%$#@!", "##%@#$%()*&^%$#@!")); - testCase.add(new TestCaseVO("2000 최신가요", "2000ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ/2000chltlsrkdy")); - testCase.add(new TestCaseVO("15&", "15&")); - testCase.add(new TestCaseVO("최신가요 1990", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ1990/chltlsrkdy1990")); - testCase.add(new TestCaseVO("최신가요&", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ&/chltlsrkdy&")); + testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld")); + testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk")); + testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk")); } else if (options.isMistype() == false && options.isChosung() == true) { @@ -84,23 +56,9 @@ public void testTokenizer() throws IOException { testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/ㅅㄴㅅㄷ")); testCase.add(new TestCaseVO("Hello", "hello")); testCase.add(new TestCaseVO("Hello~", "hello~")); - testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅈㄱㅎㅍㅇㄷ")); - testCase.add(new TestCaseVO("소원을 말해봐 genie mixversion##!", "ㅅㅗㅇㅜㅓㄴㅇㅡㄹㅁㅏㄹㅎㅐㅂㅗㅏgeniemixversion##!/ㅅㅇㅇㅁㅎㅂ")); - testCase.add(new TestCaseVO("hush hush; hush hush", "hushhush;hushhush")); - testCase.add(new TestCaseVO("라벨:모음곡(거울)제4곡(어릿광대의 아침노래)(서현)", "ㄹㅏㅂㅔㄹ:ㅁㅗㅇㅡㅁㄱㅗㄱ(ㄱㅓㅇㅜㄹ)ㅈㅔ4ㄱㅗㄱ(ㅇㅓㄹㅣㅅㄱㅗㅏㅇㄷㅐㅇㅡㅣㅇㅏㅊㅣㅁㄴㅗㄹㅐ)(ㅅㅓㅎㅕㄴ)/ㄹㅂㅁㅇㄱㄱㅇㅈㄱㅇㄹㄱㄷㅇㅇㅊㄴㄹㅅㅎ")); - testCase.add(new TestCaseVO("ㅗ디ㅣㅐ", "ㅗㄷㅣㅣㅐ")); //hello - testCase.add(new TestCaseVO("째깅", "ㅈㅈㅐㄱㅣㅇ/ㅈㅈㄱ")); //World - testCase.add(new TestCaseVO("ㅣㅐ햣ㄷ초", "ㅣㅐㅎㅑㅅㄷㅊㅗ")); //logitech - testCase.add(new TestCaseVO("퍗므ㅑㅜ", "ㅍㅑㅅㅁㅡㅑㅜ")); //vitamin - testCase.add(new TestCaseVO("랱", "ㄹㅐㅌ")); //fox - testCase.add(new TestCaseVO("ㅅ딛퍄냐ㅐㅜ", "ㅅㄷㅣㄷㅍㅑㄴㅑㅐㅜ")); //television - testCase.add(new TestCaseVO("최", "ㅊㅗㅣ")); - testCase.add(new TestCaseVO("ㅅㄴㅅㄷ", "ㅅㄴㅅㄷ")); //소녀시대 초성검색 - testCase.add(new TestCaseVO("##%@#$%()*&^%$#@!", "##%@#$%()*&^%$#@!")); - testCase.add(new TestCaseVO("2000 최신가요", "2000ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ")); - testCase.add(new TestCaseVO("15&", "15&")); - testCase.add(new TestCaseVO("최신가요 1990", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ1990/ㅊㅅㄱㅇ")); - testCase.add(new TestCaseVO("최신가요&", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ&/ㅊㅅㄱㅇ")); + testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅈㄱㅎㅍㅇㄷ")); + testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㄷㄷㅅㅇㄷㅎ")); + testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/ㅌㄷㅇㅌㄱ")); } else if (options.isMistype() == false && options.isChosung() == false) { @@ -108,24 +66,9 @@ public void testTokenizer() throws IOException { testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ")); testCase.add(new TestCaseVO("Hello", "hello")); testCase.add(new TestCaseVO("Hello~", "hello~")); - testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ")); - testCase.add(new TestCaseVO("소원을 말해봐 genie mixversion##!", "ㅅㅗㅇㅜㅓㄴㅇㅡㄹㅁㅏㄹㅎㅐㅂㅗㅏgeniemixversion##!")); - testCase.add(new TestCaseVO("hush hush; hush hush", "hushhush;hushhush")); - testCase.add(new TestCaseVO("라벨:모음곡(거울)제4곡(어릿광대의 아침노래)(서현)", "ㄹㅏㅂㅔㄹ:ㅁㅗㅇㅡㅁㄱㅗㄱ(ㄱㅓㅇㅜㄹ)ㅈㅔ4ㄱㅗㄱ(ㅇㅓㄹㅣㅅㄱㅗㅏㅇㄷㅐㅇㅡㅣㅇㅏㅊㅣㅁㄴㅗㄹㅐ)(ㅅㅓㅎㅕㄴ)")); - testCase.add(new TestCaseVO("ㅗ디ㅣㅐ", "ㅗㄷㅣㅣㅐ")); //hello - testCase.add(new TestCaseVO("째깅", "ㅈㅈㅐㄱㅣㅇ")); //World - testCase.add(new TestCaseVO("ㅣㅐ햣ㄷ초", "ㅣㅐㅎㅑㅅㄷㅊㅗ")); //logitech - testCase.add(new TestCaseVO("퍗므ㅑㅜ", "ㅍㅑㅅㅁㅡㅑㅜ")); //vitamin - testCase.add(new TestCaseVO("랱", "ㄹㅐㅌ")); //fox - testCase.add(new TestCaseVO("ㅅ딛퍄냐ㅐㅜ", "ㅅㄷㅣㄷㅍㅑㄴㅑㅐㅜ")); //television - testCase.add(new TestCaseVO("최", "ㅊㅗㅣ")); - testCase.add(new TestCaseVO("ㅅㄴㅅㄷ", "ㅅㄴㅅㄷ")); //소녀시대 초성검색 - testCase.add(new TestCaseVO("##%@#$%()*&^%$#@!", "##%@#$%()*&^%$#@!")); - testCase.add(new TestCaseVO("2000 최신가요", "2000ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ")); - testCase.add(new TestCaseVO("15&", "15&")); - testCase.add(new TestCaseVO("최신가요 1990", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ1990")); - testCase.add(new TestCaseVO("최신가요&", "ㅊㅗㅣㅅㅣㄴㄱㅏㅇㅛ&")); - + testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ")); + testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ")); + testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ")); } for (TestCaseVO vo : testCase) {