Last year, I was asked to add a Chinese tokenizer feature to our search engine. I studied several existing libraries and finally settled on SCWS (Simple Chinese Word Segmentation), an open-source library and command-line tool created by hightman that can be downloaded from GitHub.
Create build scripts:
Start building:
The library and executables:
Basic concepts
http://en.wikipedia.org/wiki/Lexical_analysis#Token http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en http://baike.baidu.com/view/19109.htm [Chinese]
About SCWS
http://www.xunsearch.com/scws/index.php [Chinese]
http://www.xunsearch.com/scws/docs.php [Chinese]
https://github.com/hightman/scws
Create CMake files:
$ cd /home/leer/ws/myscws
$ vim CMakeLists.txt
$ vim /home/leer/ws/myscws/libscws/version.h:
Build libscws and its command line tools using CMake
1. Download scws from github, and checkout tag 1.2.2:
$ git clone https://github.com/hightman/scws.git ~/ws/scws/
$ cd ~/ws/scws/
$ git tag
1.2.0
1.2.1
1.2.2
$ git checkout 1.2.2
2. Build libscws and command line tools:
Prepare the environment:
$ sudo apt-get install cmake
$ mkdir /home/leer/ws/myscws
$ cd /home/leer/ws/myscws
$ cp -rf ~/ws/scws/libscws/ .
$ cp -rf ~/ws/scws/cli/ .
Create CMake files:
$ cd /home/leer/ws/myscws
$ vim CMakeLists.txt
# Top-level build script for the myscws tree.
# Builds the libscws static library (libscws/) and the command-line tools (cli/).

# One command per line: CMake requires a newline after every command invocation.
# The "<min>...<max>" form keeps old CMake releases working (they ignore the
# "...<max>" part) while telling newer releases which policies are known-good;
# CMake 4.x rejects a bare policy version below 3.5.
cmake_minimum_required(VERSION 2.8.12...3.28)

# Only a C toolchain is needed; without the language list CMake would also
# look for a C++ compiler.
project(myscws C)

# Root of the myscws source tree, exposed in the cache for sub-projects/tools.
set(myscws_ROOT "${CMAKE_CURRENT_LIST_DIR}" CACHE PATH "myscws root directory")

# The default binary directory of a subdirectory is
# ${CMAKE_CURRENT_BINARY_DIR}/<name>, which is what was spelled out before.
# libscws must come first: it defines the `scws` target and the
# LIBSCWS_API_INCLUDES cache entry that cli/ consumes.
add_subdirectory(libscws)
add_subdirectory(cli)
$ vim /home/leer/ws/myscws/libscws/CMakeLists.txt
Add some patch to SCWS:
# libscws/CMakeLists.txt: builds the static library target `scws` from every
# .c/.h file in this directory.
project(scws C)

# Cache entries kept for consumers that still read them (cli/ uses
# LIBSCWS_API_INCLUDES). Quoted so a path containing spaces stays one value.
set(LIBSCWS_ROOT "${PROJECT_SOURCE_DIR}" CACHE PATH "LIBSCWS_ROOT")
set(LIBSCWS_API_INCLUDES "${LIBSCWS_ROOT}" CACHE PATH "LIBSCWS_API_INCLUDES")

# NOTE: globbed lists are only refreshed when CMake is re-run; re-configure
# after adding or removing source files.
file(GLOB LIBSCWS_HEADER_FILES "${PROJECT_SOURCE_DIR}/*.h")
file(GLOB LIBSCWS_SOURCE_FILES "${PROJECT_SOURCE_DIR}/*.c")

add_library(${PROJECT_NAME} STATIC
  ${LIBSCWS_SOURCE_FILES}
  ${LIBSCWS_HEADER_FILES}
)

# Target-scoped instead of include_directories(): PUBLIC applies the include
# path to this library and to every target that links against it.
target_include_directories(${PROJECT_NAME} PUBLIC ${LIBSCWS_API_INCLUDES})

# PUBLIC because this is a static library: executables linking `scws` must
# also link the math library.
target_link_libraries(${PROJECT_NAME} PUBLIC m)
$ vim /home/leer/ws/myscws/cli/CMakeLists.txt
# cli/CMakeLists.txt: builds the two command-line tools, scws_cli_cmd and
# scws_cli_gen_dict, and links both against the `scws` library target.
project(scws_cli C)

set(SCWS_CMD_ROOT "${PROJECT_SOURCE_DIR}" CACHE PATH "SCWS_CMD_ROOT")

# Sources are named directly rather than through file(GLOB): a glob on a
# literal file name silently yields an empty list when the file is missing,
# whereas an explicit path makes CMake report the missing source by name.

# scws_cli_cmd
add_executable(${PROJECT_NAME}_cmd "${PROJECT_SOURCE_DIR}/scws_cmd.c")
# LIBSCWS_API_INCLUDES is the cache entry set by libscws/CMakeLists.txt.
target_include_directories(${PROJECT_NAME}_cmd PRIVATE ${LIBSCWS_API_INCLUDES})
target_link_libraries(${PROJECT_NAME}_cmd PRIVATE scws)

# scws_cli_gen_dict
add_executable(${PROJECT_NAME}_gen_dict "${PROJECT_SOURCE_DIR}/gen_dict.c")
target_include_directories(${PROJECT_NAME}_gen_dict PRIVATE ${LIBSCWS_API_INCLUDES})
target_link_libraries(${PROJECT_NAME}_gen_dict PRIVATE scws)
$ vim /home/leer/ws/myscws/libscws/version.h:
/*
 * version.h -- derived from version.h.in (the input file for configure).
 * The values are written out by hand here because this tree is built with
 * CMake and configure is never run.
 *
 * Each preprocessor directive must start on its own line.
 */
#ifndef SCWS_VERSION
#define SCWS_VERSION "1.2.2"
#endif

#define SCWS_BUGREPORT "http://www.xunsearch.com/scws"

/* Value kept exactly as in the original patch. */
#define PACKAGE_VERSION "UNKNOW"
$ vim /home/leer/ws/myscws/cli/gen_dict.c
#include "scws.h"
$ vim build.sh
#!/bin/bash
# Configure and build myscws out-of-source in ./Build.
# Run it from the directory that contains the top-level CMakeLists.txt.

# Stop at the first failing step (same effect as chaining with &&).
set -e

# Start from an empty build directory. The earlier `rm -rf` had no operand,
# so it removed nothing and stale CMake cache files survived between builds.
rm -rf Build
mkdir -p Build
cd Build

cmake ..
make
# No trailing `cd -`: the script runs in its own shell process, so the
# caller's working directory is never changed.
Start building:
$ chmod +x build.sh
$ ./build.sh
The library and executables:
$ find . -name "libscws*" -type f
./Build/libscws/libscws.a
$ find . -name "scws_cli*" -type f
./Build/cli/scws_cli_cmd
./Build/cli/scws_cli_gen_dict
No comments:
Post a Comment