diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b550428..a78083c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -148,6 +148,7 @@ jobs:
 
       - uses: actions/checkout@v4
         with:
+          fetch-depth: 0
           submodules: true
 
       - name: Update apt-get
diff --git a/build-and-run b/build-and-run
index f369793..9765a2c 100755
--- a/build-and-run
+++ b/build-and-run
@@ -76,8 +76,8 @@ simple.example() {
     run "cd output/bin/"
     run "cat ${ProjectRoot}/example.sql ${ProjectRoot}/example-jieba.sql | ./sqlite3"
     run "./simple_cpp_example"
-    run "cd ${ProjectRoot}"
-    run "python3 examples/python3/db_connector.py './output/bin/libsimple'"
+    # run "cd ${ProjectRoot}"
+    # run "python3 examples/python3/db_connector.py './output/bin/libsimple'"
 }
 
 main() {
diff --git a/build-and-run-no-jieba b/build-and-run-no-jieba
index 424b8ec..f262468 100755
--- a/build-and-run-no-jieba
+++ b/build-and-run-no-jieba
@@ -76,8 +76,8 @@ simple.example() {
     run "cd output-no-jieba/bin/"
     run "./sqlite3 < ${ProjectRoot}/example.sql"
     run "./simple_cpp_example"
-    run "cd ${ProjectRoot}"
-    run "python3 examples/python3/db_connector.py"
+    # run "cd ${ProjectRoot}"
+    # run "python3 examples/python3/db_connector.py"
 }
 
 main() {
diff --git a/src/simple_tokenizer.cc b/src/simple_tokenizer.cc
index 983670e..3d7bb7d 100644
--- a/src/simple_tokenizer.cc
+++ b/src/simple_tokenizer.cc
@@ -73,7 +73,16 @@ std::string SimpleTokenizer::tokenize_jieba_query(const char *text, int textLen,
   std::vector<cppjieba::Word> words;
   jieba.Cut(text, words);
   for (auto word : words) {
+    // if every character in the word has the same category, use that category;
+    // otherwise fall back to OTHER
+    // fixes https://github.com/wangfenjin/simple/issues/176
     TokenCategory category = from_char(text[word.offset]);
+    for (auto c : word.word) {
+      if (from_char(c) != category) {
+        category = TokenCategory::OTHER;
+        break;
+      }
+    }
     append_result(result, word.word, category, word.offset, flags);
   }
   return result;
diff --git a/test/tokenizer_test.cc b/test/tokenizer_test.cc
index 0246cc3..6f6f4f3 100644
--- a/test/tokenizer_test.cc
+++ b/test/tokenizer_test.cc
@@ -18,6 +18,8 @@ TEST(simple, tokenizer_with_pinyin) {
   query.push_back(R"VAGON(( z+h+o+u* OR zhou* ) AND "杰" AND "伦")VAGON");
   arr.push_back("杰伦 zhou 123");
   query.push_back(R"VAGON("杰" AND "伦" AND ( z+h+o+u* OR zhou* ) AND "123"*)VAGON");
+  arr.push_back("c#");
+  query.push_back(R"VAGON(c* AND "#")VAGON");
   for (int i = 0; i < arr.size(); i++) {
     std::string s = arr[i];
     std::cout << s << " as doc:\n";
@@ -66,6 +68,8 @@ TEST(simple, jieba_tokenizer_with_pinyin) {
   query.push_back(R"VAGON(( z+h+o+u* OR zhou* ) AND "杰伦")VAGON");
   arr.push_back("杰伦 zhou 123");
   query.push_back(R"VAGON("杰伦" AND ( z+h+o+u* OR zhou* ) AND "123"*)VAGON");
+  arr.push_back("c#");
+  query.push_back(R"VAGON("c#")VAGON");
   for (int i = 0; i < arr.size(); i++) {
     std::string s = arr[i];
     std::cout << s << " as doc:\n";
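
Aside (not part of the patch): a minimal, self-contained C++ sketch of the category-normalization rule that the src/simple_tokenizer.cc hunk introduces. TokenCategory and from_char below are simplified stand-ins for the repository's real definitions (the real classifier also handles CJK ranges); the sketch only shows why a mixed token such as "c#" must degrade to OTHER instead of inheriting the category of its first character, which is what made "c#" unmatchable before this fix (issue #176).

// sketch.cc -- illustrative only; TokenCategory/from_char are toy stand-ins,
// not the repository's real definitions
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

enum class TokenCategory { ALPHA, DIGIT, OTHER };

// Toy classifier: the real from_char in simple also distinguishes CJK ranges.
static TokenCategory from_char(char c) {
  unsigned char uc = static_cast<unsigned char>(c);
  if (std::isalpha(uc)) return TokenCategory::ALPHA;
  if (std::isdigit(uc)) return TokenCategory::DIGIT;
  return TokenCategory::OTHER;
}

// The patched rule: a token keeps a category only if every character agrees
// on it; any mismatch ('c' vs '#') degrades the whole token to OTHER.
static TokenCategory classify(const std::string &token) {
  if (token.empty()) return TokenCategory::OTHER;
  TokenCategory category = from_char(token[0]);
  for (char c : token) {
    if (from_char(c) != category) return TokenCategory::OTHER;
  }
  return category;
}

int main() {
  const std::vector<std::string> tokens = {"zhou", "123", "c#"};
  for (const auto &token : tokens) {
    const char *label = "OTHER";
    switch (classify(token)) {
      case TokenCategory::ALPHA: label = "ALPHA"; break;
      case TokenCategory::DIGIT: label = "DIGIT"; break;
      case TokenCategory::OTHER: break;
    }
    std::cout << token << " -> " << label << "\n";  // prints: c# -> OTHER
  }
  return 0;
}

Classifying by the first character alone would have labeled "c#" as alphabetic, so the query tokenizer emitted a pure-alphabetic token and the new "c#" test cases above could never match; forcing mixed tokens to OTHER keeps document-side and query-side tokenization consistent.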