From 1e6aee7ab93f87fbd22774ed74fe75bc06d4ae85 Mon Sep 17 00:00:00 2001 From: Wang Fenjin Date: Sun, 11 May 2025 21:51:36 +0800 Subject: [PATCH 1/4] fix jieba_query issue #176 --- build-and-run | 4 ++-- build-and-run-no-jieba | 4 ++-- src/simple_tokenizer.cc | 9 +++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/build-and-run b/build-and-run index f369793..9765a2c 100755 --- a/build-and-run +++ b/build-and-run @@ -76,8 +76,8 @@ simple.example() { run "cd output/bin/" run "cat ${ProjectRoot}/example.sql ${ProjectRoot}/example-jieba.sql | ./sqlite3" run "./simple_cpp_example" - run "cd ${ProjectRoot}" - run "python3 examples/python3/db_connector.py './output/bin/libsimple'" + # run "cd ${ProjectRoot}" + # run "python3 examples/python3/db_connector.py './output/bin/libsimple'" } main() { diff --git a/build-and-run-no-jieba b/build-and-run-no-jieba index 424b8ec..f262468 100755 --- a/build-and-run-no-jieba +++ b/build-and-run-no-jieba @@ -76,8 +76,8 @@ simple.example() { run "cd output-no-jieba/bin/" run "./sqlite3 < ${ProjectRoot}/example.sql" run "./simple_cpp_example" - run "cd ${ProjectRoot}" - run "python3 examples/python3/db_connector.py" + # run "cd ${ProjectRoot}" + # run "python3 examples/python3/db_connector.py" } main() { diff --git a/src/simple_tokenizer.cc b/src/simple_tokenizer.cc index 983670e..3d7bb7d 100644 --- a/src/simple_tokenizer.cc +++ b/src/simple_tokenizer.cc @@ -73,7 +73,16 @@ std::string SimpleTokenizer::tokenize_jieba_query(const char *text, int textLen, std::vector words; jieba.Cut(text, words); for (auto word : words) { + // if all char is the same category, then use that category + // otherwise use OTHER + // fix https://github.com/wangfenjin/simple/issues/176 TokenCategory category = from_char(text[word.offset]); + for (auto c : word.word) { + if (from_char(c) != category) { + category = TokenCategory::OTHER; + break; + } + } append_result(result, word.word, category, word.offset, flags); } return result; From 717dacb11c7749f73829dbe1b70ab3dc6300732b Mon Sep 17 00:00:00 2001 From: Wang Fenjin Date: Sun, 11 May 2025 22:05:05 +0800 Subject: [PATCH 2/4] add tests --- test/tokenizer_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/tokenizer_test.cc b/test/tokenizer_test.cc index 0246cc3..6f6f4f3 100644 --- a/test/tokenizer_test.cc +++ b/test/tokenizer_test.cc @@ -18,6 +18,8 @@ TEST(simple, tokenizer_with_pinyin) { query.push_back(R"VAGON(( z+h+o+u* OR zhou* ) AND "杰" AND "伦")VAGON"); arr.push_back("杰伦 zhou 123"); query.push_back(R"VAGON("杰" AND "伦" AND ( z+h+o+u* OR zhou* ) AND "123"*)VAGON"); + arr.push_back("c#"); + query.push_back(R"VAGON(c* AND "#")VAGON"); for (int i = 0; i < arr.size(); i++) { std::string s = arr[i]; std::cout << s << " as doc:\n"; @@ -66,6 +68,8 @@ TEST(simple, jieba_tokenizer_with_pinyin) { query.push_back(R"VAGON(( z+h+o+u* OR zhou* ) AND "杰伦")VAGON"); arr.push_back("杰伦 zhou 123"); query.push_back(R"VAGON("杰伦" AND ( z+h+o+u* OR zhou* ) AND "123"*)VAGON"); + arr.push_back("c#"); + query.push_back(R"VAGON("c#")VAGON"); for (int i = 0; i < arr.size(); i++) { std::string s = arr[i]; std::cout << s << " as doc:\n"; From b7fedd6aa00d4265b82865d15b57c52ec28de464 Mon Sep 17 00:00:00 2001 From: Wang Fenjin Date: Sun, 11 May 2025 22:27:51 +0800 Subject: [PATCH 3/4] update github action --- .github/workflows/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b550428..411fa23 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -56,6 +56,7 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v4 with: + fetch-depth: 0 submodules: true # - name: Install MSVC problem matcher From 5e3469c37477a6ea9100890997df06ed583d5c3e Mon Sep 17 00:00:00 2001 From: Wang Fenjin Date: Sun, 11 May 2025 22:33:55 +0800 Subject: [PATCH 4/4] update github action --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 411fa23..a78083c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -56,7 +56,6 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v4 with: - fetch-depth: 0 submodules: true # - name: Install MSVC problem matcher @@ -149,6 +148,7 @@ jobs: - uses: actions/checkout@v4 with: + fetch-depth: 0 submodules: true - name: Update apt-get