Friday, September 19, 2014

Chinese sentence tokenization using SCWS

Last year, I was asked to add a Chinese tokenizer feature to our search engine. I studied some existing libraries and finally settled on SCWS (Simple Chinese Word Segmentation), an open-source library and command-line tool created by hightman that can be downloaded from GitHub.

Basic concepts


About SCWS

https://github.com/hightman/scws

Build libscws and its command line tools using CMake

1. Download scws from github, and checkout tag 1.2.2:

$ git clone https://github.com/hightman/scws.git ~/ws/scws/
$ cd ~/ws/scws/
$ git tag
1.2.0
1.2.1
1.2.2
$ git checkout 1.2.2

2. Build libscws and command line tools:

Prepare Environment:
$ sudo apt-get install cmake
$ mkdir /home/leer/ws/myscws
$ cd /home/leer/ws/myscws
$ cp -rf ~/ws/scws/libscws/ .
$ cp -rf ~/ws/scws/cli/ .

Create CMake files:
$ cd /home/leer/ws/myscws
$ vim CMakeLists.txt
# Top-level build script for the myscws tree. It configures two
# subdirectories: libscws/ (the SCWS static library) and cli/ (the
# command-line tools that link against it).
#
# The <min>...<max> form keeps old CMake releases usable (they ignore the
# "...<max>" suffix) while newer releases, which reject a bare pre-3.5
# minimum, pick up modern policy defaults.
cmake_minimum_required(VERSION 2.8.12...3.29)

# Only C sources are built here; naming the language avoids requiring a
# C++ compiler that is never used.
project(myscws C)

# Root of this source tree, published through the cache.
set(myscws_ROOT ${CMAKE_CURRENT_LIST_DIR} CACHE STRING "myscws root directory")

# The default binary directory of each subdirectory is
# ${CMAKE_CURRENT_BINARY_DIR}/<name>, so it does not need to be spelled out.
# libscws must come first: cli/ links the `scws` target it defines.
add_subdirectory(libscws)
add_subdirectory(cli)


$ vim /home/leer/ws/myscws/libscws/CMakeLists.txt
# Builds every C source in this directory into the static library target
# `scws`.
project(scws C)

# Published through the cache so other directories of the build can locate
# the library sources and headers by variable name.
set(LIBSCWS_ROOT ${PROJECT_SOURCE_DIR} CACHE STRING "LIBSCWS_ROOT")
set(LIBSCWS_API_INCLUDES ${LIBSCWS_ROOT} CACHE STRING "LIBSCWS_API_INCLUDES")

# NOTE: globbed lists are only refreshed when CMake is re-run; after adding
# or removing a file in this directory, re-configure before building.
file(GLOB LIBSCWS_HEADER_FILES ${PROJECT_SOURCE_DIR}/*.h)
file(GLOB LIBSCWS_SOURCE_FILES ${PROJECT_SOURCE_DIR}/*.c)

add_library(${PROJECT_NAME} STATIC ${LIBSCWS_SOURCE_FILES} ${LIBSCWS_HEADER_FILES})

# PUBLIC: the headers are needed to compile the library itself and by any
# target that links it, so the include path travels with the target instead
# of being applied to the whole directory.
target_include_directories(${PROJECT_NAME} PUBLIC ${LIBSCWS_API_INCLUDES})

# The math library must also reach the final link line of consumers, because
# a static library does not carry its own link dependencies.
target_link_libraries(${PROJECT_NAME} PUBLIC m)


$ vim /home/leer/ws/myscws/cli/CMakeLists.txt
# Builds the two SCWS command-line tools, scws_cli_cmd and
# scws_cli_gen_dict, each from a single C source linked against the `scws`
# library target.
project(scws_cli C)

set(SCWS_CMD_ROOT ${PROJECT_SOURCE_DIR} CACHE STRING "SCWS_CMD_ROOT")

# Each tool has exactly one known source file, so it is named directly
# rather than globbed; a missing file is then reported against its real name.
add_executable(${PROJECT_NAME}_cmd ${PROJECT_SOURCE_DIR}/scws_cmd.c)
# LIBSCWS_API_INCLUDES is read from the cache; it must have been set before
# this directory is processed.
target_include_directories(${PROJECT_NAME}_cmd PRIVATE ${LIBSCWS_API_INCLUDES})
target_link_libraries(${PROJECT_NAME}_cmd PRIVATE scws)

add_executable(${PROJECT_NAME}_gen_dict ${PROJECT_SOURCE_DIR}/gen_dict.c)
target_include_directories(${PROJECT_NAME}_gen_dict PRIVATE ${LIBSCWS_API_INCLUDES})
target_link_libraries(${PROJECT_NAME}_gen_dict PRIVATE scws)

Apply some patches to SCWS:
$ vim /home/leer/ws/myscws/libscws/version.h
/* version.h.in. input file for configure */
/* Version and package-information macros for libscws.
 * NOTE(review): presumably a hand-edited stand-in for the header that
 * configure would generate from version.h.in; confirm. */
/* Library version string. The guard lets a definition made earlier (for
 * example on the compiler command line) take precedence over this default. */
#ifndef SCWS_VERSION
#define SCWS_VERSION "1.2.2"
#endif
/* Address given for bug reports. */
#define SCWS_BUGREPORT "http://www.xunsearch.com/scws"
/* NOTE(review): "UNKNOW" looks like a typo for "UNKNOWN"; left as-is because
 * the literal is program text. Confirm before changing. */
#define PACKAGE_VERSION "UNKNOW"

$ vim /home/leer/ws/myscws/cli/gen_dict.c
#include "scws.h"

Create build scripts:
$ vim build.sh
#!/bin/bash
# Clean out-of-source build: discard any previous ./Build tree (including a
# stale CMakeCache.txt), then configure and compile inside a fresh one.
# The original `rm -rf` had no operand and therefore removed nothing.
# Run from the directory that contains the top-level CMakeLists.txt.
rm -rf Build && mkdir -p Build && cd Build && cmake .. && make && cd -

Start building:
$ chmod 777 build.sh
$ ./build.sh

The library and executables:
$ find . -name "libscws*" -type f
./Build/libscws/libscws.a
$ find . -name "scws_cli*" -type f
./Build/cli/scws_cli_cmd
./Build/cli/scws_cli_gen_dict


3. How to use these command line tools:

TODO.

Using SCWS C APIs

TODO.



No comments:

Post a Comment