Here you can find the source of splitTerms(String queryString)
private static final String[] splitTerms(String queryString)
//package com.java2s;
/*
 Copyright 2009 Semantic Discovery, Inc. (www.semanticdiscovery.com)

 This file is part of the Semantic Discovery Toolkit.

 The Semantic Discovery Toolkit is free software: you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 The Semantic Discovery Toolkit is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public License
 along with The Semantic Discovery Toolkit.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.ArrayList;
import java.util.List;

public class Main {

    /**
     * Splits a query string into terms.
     *
     * <p>A term is a maximal run of code points that are either Unicode letters or digits
     * (per {@link Character#isLetterOrDigit(int)}) or one of the characters
     * {@code ':'}, {@code '+'}, {@code '^'}, {@code '-'}. Every other code point
     * (whitespace, punctuation, underscore, ...) acts as a separator and is discarded.
     *
     * <p>This is essentially {@code queryString.split("[^\\w\\d:+^-]+")}, which doesn't
     * work when there are Asian characters.
     *
     * @param queryString the string to split; must not be {@code null}
     * @return the terms in order of appearance; an empty array when the input contains
     *         no term characters
     * @throws NullPointerException if {@code queryString} is {@code null}
     */
    private static final String[] splitTerms(String queryString) {
        final List<String> result = new ArrayList<String>();
        final StringBuilder builder = new StringBuilder();
        final int len = queryString.length();

        int i = 0;
        while (i < len) {
            final int cp = queryString.codePointAt(i);

            if (isTermCodePoint(cp)) {
                builder.appendCodePoint(cp);
            } else if (builder.length() > 0) {
                // Separator after a term: emit the term and start over.
                result.add(builder.toString());
                builder.setLength(0);
            }

            // Advance by the number of UTF-16 units this code point occupies.
            // Stepping one char at a time would land on the low surrogate of a
            // supplementary character, which is not a letter/digit and would
            // wrongly split the term after every such character.
            i += Character.charCount(cp);
        }

        // Flush the trailing term, if the input did not end with a separator.
        if (builder.length() > 0) {
            result.add(builder.toString());
        }

        return result.toArray(new String[result.size()]);
    }

    /** Returns whether the given code point may be part of a term. */
    private static boolean isTermCodePoint(int cp) {
        return Character.isLetterOrDigit(cp)
                || cp == ':' || cp == '+' || cp == '^' || cp == '-';
    }
}