List of usage examples for java.lang.Character.isAlphabetic(int)
public static boolean isAlphabetic(int codePoint)
From source file:org.structr.web.common.DownloadHelper.java
public static GraphObjectMap getContextObject(final String searchTerm, final String text, final int contextLength) { final GraphObjectMap contextObject = new GraphObjectMap(); final Set<String> contextValues = new LinkedHashSet<>(); final String[] searchParts = searchTerm.split("[\\s,;]+"); final GenericProperty contextKey = new GenericProperty("context"); for (final String searchString : searchParts) { final String lowerCaseSearchString = searchString.toLowerCase(); final String lowerCaseText = text.toLowerCase(); final StringBuilder wordBuffer = new StringBuilder(); final StringBuilder lineBuffer = new StringBuilder(); final int textLength = text.length(); /*//from www . jav a2 s.co m * we take an average word length of 8 characters, multiply * it by the desired prefix and suffix word count, add 20% * and try to extract up to prefixLength words. */ // modify these parameters to tune prefix and suffix word extraction // loop variables int newlineCount = 0; int wordCount = 0; // wordCount starts at 1 because we include the matching word int pos = -1; do { // find next occurrence pos = lowerCaseText.indexOf(lowerCaseSearchString, pos + 1); if (pos > 0) { lineBuffer.setLength(0); wordBuffer.setLength(0); wordCount = 0; newlineCount = 0; // fetch context words before search hit for (int i = pos; i >= 0; i--) { final char c = text.charAt(i); if (!Character.isAlphabetic(c) && !Character.isDigit(c) && !FulltextTokenizer.SpecialChars.contains(c)) { wordCount += flushWordBuffer(lineBuffer, wordBuffer, true); // store character in buffer wordBuffer.insert(0, c); if (c == '\n') { // increase newline count newlineCount++; } else { // reset newline count newlineCount = 0; } // paragraph boundary reached if (newlineCount > 1) { break; } // stop if we collected half of the desired word count if (wordCount > contextLength / 2) { break; } } else { // store character in buffer wordBuffer.insert(0, c); // reset newline count newlineCount = 0; } } wordCount += 
flushWordBuffer(lineBuffer, wordBuffer, true); wordBuffer.setLength(0); // fetch context words after search hit for (int i = pos + 1; i < textLength; i++) { final char c = text.charAt(i); if (!Character.isAlphabetic(c) && !Character.isDigit(c) && !FulltextTokenizer.SpecialChars.contains(c)) { wordCount += flushWordBuffer(lineBuffer, wordBuffer, false); // store character in buffer wordBuffer.append(c); if (c == '\n') { // increase newline count newlineCount++; } else { // reset newline count newlineCount = 0; } // paragraph boundary reached if (newlineCount > 1) { break; } // stop if we collected enough words if (wordCount > contextLength) { break; } } else { // store character in buffer wordBuffer.append(c); // reset newline count newlineCount = 0; } } wordCount += flushWordBuffer(lineBuffer, wordBuffer, false); // replace single newlines with space contextValues.add(lineBuffer.toString().trim()); } } while (pos >= 0); } contextObject.put(contextKey, contextValues); return contextObject; }
From source file:org.wikipedia.nirvana.archive.ArchiveWithHeadersWithItemsCount.java
/**
 * Replaces the numeric variable part of the given header with the template
 * placeholder, walking the configured header pattern to locate it.
 *
 * @param header the header string to process
 * @param data   optional out-parameter; receives the parsed numeric value and
 *               the resulting string (may be null)
 * @return the header with the numeric part replaced by the template, or the
 *         unchanged header if the pattern does not contain the template
 */
public String headerVariableToConstant(String header, IntAndString data) {

    String str = header;
    final int len = str.length();
    int pos = 0;

    for (HeaderFormatItem item : patternOfHeader) {

        if (item.period == Period.NONE) {

            if (item.string.equals(template)) {

                int start = pos;

                // FIX: bounds check prevents StringIndexOutOfBoundsException when
                // the digit run extends to the end of the header
                while (pos < len && Character.isDigit(str.charAt(pos))) {
                    pos++;
                }

                str = header.substring(0, start) + template + str.substring(pos);

                if (data != null) {
                    // NOTE(review): throws NumberFormatException if no digits were
                    // found at the template position — same as the original behavior
                    data.val = Integer.parseInt(header.substring(start, pos));
                    data.str = str;
                }

                return str;

            } else if (item.string.length() > 0) {
                // fixed literal part of the pattern: skip over it
                pos += item.string.length();
            } else {
                return str;
            }

        } else {
            // FIX: both scans were unguarded and could run past the end of the string
            if (item.period.isNumeric()) {
                while (pos < len && Character.isDigit(str.charAt(pos))) {
                    pos++;
                }
            } else {
                while (pos < len && Character.isAlphabetic(str.charAt(pos))) {
                    pos++;
                }
            }
        }
    }

    return str;
}
From source file:org.yasmin.core.config.parser.KeyValueToken.java
/** * Checks if the given line is a valid keypair line. * /*from w ww . j a va 2 s. c om*/ * @param line * The line to be verified * @return <code>true</code> if the line is a valid keypair line. */ private boolean isValid(String line) { if (!Character.isAlphabetic(line.charAt(0))) { return false; } // Line must not end with { if (StringUtils.strip(line).endsWith("{")) { return false; } int equalIndex = line.indexOf("="); // Starts with a character... it must have an equal sign... if (equalIndex == -1) { return false; } // Equal sign found... Let's validate the key... String keyName = line.substring(0, equalIndex); if (!keyName.matches(KEY_REGEX)) { return false; } return true; }
From source file:qa.ProcessFrameProcessor.java
public void toClearParserFormat(String clearParserFileName) throws FileNotFoundException, IOException { ArrayList<ProcessFrame> processFrames = getProcArr(); PrintWriter writer = new PrintWriter(clearParserFileName); for (ProcessFrame p : processFrames) { String rawText = p.getRawText(); rawText = rawText.replace(".", " "); rawText = rawText.replaceAll("\"", ""); rawText = rawText.trim();/* www.ja va 2 s . c o m*/ for (int j = rawText.length() - 1;; j--) { if (Character.isAlphabetic(rawText.charAt(j))) { rawText = rawText.substring(0, j + 1); rawText += "."; break; } } /*rawText = rawText.replace(".", " "); rawText = rawText.replaceAll("\"", ""); rawText = rawText.trim(); rawText += ".";**/ // update tokenized text here List<String> tokenized = slem.tokenize(rawText); p.setTokenizedText(tokenized.toArray(new String[tokenized.size()])); try { DependencyTree tree = depParser.parse(rawText); String conLLStr = ClearParserUtil.toClearParserFormat(tree, p); writer.println(conLLStr); writer.println(); } catch (Exception e) { e.printStackTrace(); //System.out.println(rawText); } } writer.close(); }
From source file:qa.ProcessFrameProcessor.java
public void toConLL2009Format(String conll2009FileName) throws FileNotFoundException, IOException { ArrayList<ProcessFrame> processFrames = getProcArr(); PrintWriter writer = new PrintWriter(conll2009FileName); for (ProcessFrame p : processFrames) { String rawText = p.getRawText(); rawText = rawText.replace(".", " "); rawText = rawText.replaceAll("\"", ""); rawText = rawText.trim();/* w w w . j av a2 s. c o m*/ for (int j = rawText.length() - 1;; j--) { if (Character.isAlphabetic(rawText.charAt(j))) { rawText = rawText.substring(0, j + 1); rawText += "."; break; } } /*rawText = rawText.replace(".", " "); rawText = rawText.replaceAll("\"", ""); rawText = rawText.trim(); rawText += ".";**/ // update tokenized text here List<String> tokenized = slem.tokenize(rawText); p.setTokenizedText(tokenized.toArray(new String[tokenized.size()])); try { DependencyTree tree = depParser.parse(rawText); String conLLStr = ClearParserUtil.toCONLL2009Format(tree, p); writer.println(conLLStr); //writer.println(); } catch (Exception e) { e.printStackTrace(); //System.out.println(rawText); } } writer.close(); }