List of usage examples for java.util.regex.Matcher.find()
public boolean find()
From source file:BGrep.java
public static void main(String[] args) { String encodingName = "UTF-8"; // Default to UTF-8 encoding int flags = Pattern.MULTILINE; // Default regexp flags try { // Fatal exceptions are handled after this try block // First, process any options int nextarg = 0; while (args[nextarg].charAt(0) == '-') { String option = args[nextarg++]; if (option.equals("-e")) { encodingName = args[nextarg++]; } else if (option.equals("-i")) { // case-insensitive matching flags |= Pattern.CASE_INSENSITIVE; } else if (option.equals("-s")) { // Strict Unicode processing flags |= Pattern.UNICODE_CASE; // case-insensitive Unicode flags |= Pattern.CANON_EQ; // canonicalize Unicode } else { System.err.println("Unknown option: " + option); usage();//w ww . j a va 2s. c om } } // Get the Charset for converting bytes to chars Charset charset = Charset.forName(encodingName); // Next argument must be a regexp. Compile it to a Pattern object Pattern pattern = Pattern.compile(args[nextarg++], flags); // Require that at least one file is specified if (nextarg == args.length) usage(); // Loop through each of the specified filenames while (nextarg < args.length) { String filename = args[nextarg++]; CharBuffer chars; // This will hold complete text of the file try { // Handle per-file errors locally // Open a FileChannel to the named file FileInputStream stream = new FileInputStream(filename); FileChannel f = stream.getChannel(); // Memory-map the file into one big ByteBuffer. This is // easy but may be somewhat inefficient for short files. ByteBuffer bytes = f.map(FileChannel.MapMode.READ_ONLY, 0, f.size()); // We can close the file once it is is mapped into memory. // Closing the stream closes the channel, too. 
stream.close(); // Decode the entire ByteBuffer into one big CharBuffer chars = charset.decode(bytes); } catch (IOException e) { // File not found or other problem System.err.println(e); // Print error message continue; // and move on to the next file } // This is the basic regexp loop for finding all matches in a // CharSequence. Note that CharBuffer implements CharSequence. // A Matcher holds state for a given Pattern and text. Matcher matcher = pattern.matcher(chars); while (matcher.find()) { // While there are more matches // Print out details of the match System.out.println(filename + ":" + // file name matcher.start() + ": " + // character pos matcher.group()); // matching text } } } // These are the things that can go wrong in the code above catch (UnsupportedCharsetException e) { // Bad encoding name System.err.println("Unknown encoding: " + encodingName); } catch (PatternSyntaxException e) { // Bad pattern System.err.println("Syntax error in search pattern:\n" + e.getMessage()); } catch (ArrayIndexOutOfBoundsException e) { // Wrong number of arguments usage(); } }
From source file:eu.annocultor.converters.geonames.GeonamesDumpToRdf.java
public static void main(String[] args) throws Exception { File root = new File("input_source"); // load country-continent match countryToContinent/*from ww w .jav a 2s. c o m*/ .load((new GeonamesDumpToRdf()).getClass().getResourceAsStream("/country-to-continent.properties")); // creating files Map<String, BufferedWriter> files = new HashMap<String, BufferedWriter>(); Map<String, Boolean> started = new HashMap<String, Boolean>(); for (Object string : countryToContinent.keySet()) { String continent = countryToContinent.getProperty(string.toString()); File dir = new File(root, continent); if (!dir.exists()) { dir.mkdir(); } files.put(string.toString(), new BufferedWriter(new OutputStreamWriter( new FileOutputStream(new File(root, continent + "/" + string + ".rdf")), "UTF-8"))); System.out.println(continent + "/" + string + ".rdf"); started.put(string.toString(), false); } System.out.println(started); Pattern countryPattern = Pattern .compile("<inCountry rdf\\:resource\\=\"http\\://www\\.geonames\\.org/countries/\\#(\\w\\w)\"/>"); long counter = 0; LineIterator it = FileUtils.lineIterator(new File(root, "all-geonames-rdf.txt"), "UTF-8"); try { while (it.hasNext()) { String text = it.nextLine(); if (text.startsWith("http://sws.geonames")) continue; // progress counter++; if (counter % 100000 == 0) { System.out.print("*"); } // System.out.println(counter); // get country String country = null; Matcher matcher = countryPattern.matcher(text); if (matcher.find()) { country = matcher.group(1); } // System.out.println(country); if (country == null) country = "null"; text = text.replace("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?><rdf:RDF", "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?><rdf:RDF"); if (started.get(country) == null) throw new Exception("Unknow country " + country); if (started.get(country).booleanValue()) { // remove RDF opening text = text.substring(text.indexOf("<rdf:RDF ")); text = text.substring(text.indexOf(">") + 1); } // 
remove RDF ending text = text.substring(0, text.indexOf("</rdf:RDF>")); files.get(country).append(text + "\n"); if (!started.get(country).booleanValue()) { // System.out.println("Started with country " + country); } started.put(country, true); } } finally { LineIterator.closeQuietly(it); } for (Object string : countryToContinent.keySet()) { boolean hasStarted = started.get(string.toString()).booleanValue(); if (hasStarted) { BufferedWriter bf = files.get(string.toString()); bf.append("</rdf:RDF>"); bf.flush(); bf.close(); } } return; }
From source file:gobblin.util.CLIPasswordEncryptor.java
/**
 * Command-line entry point that either decrypts an {@code ENC(...)}-wrapped password or
 * encrypts a plain one, printing the result to standard output.
 *
 * @param args command-line arguments, handed to {@code parseArgs}
 * @throws ParseException declared for failures while parsing the arguments
 * @throws RuntimeException if the encrypted input does not match {@code ENCRYPTED_PATTERN},
 *                          or if neither password option was supplied
 */
public static void main(String[] args) throws ParseException {
    final CommandLine commandLine = parseArgs(args);
    if (shouldPrintUsageAndExit(commandLine)) {
        printUsage();
        return;
    }

    // The encryptor is built before looking at the password options.
    final String masterPassword = getMasterPassword(commandLine);
    final TextEncryptor textEncryptor = getEncryptor(commandLine, masterPassword);

    // Decrypt mode.
    if (commandLine.hasOption(ENCRYPTED_PWD_OPTION)) {
        final Matcher wrapped = ENCRYPTED_PATTERN.matcher(commandLine.getOptionValue(ENCRYPTED_PWD_OPTION));
        if (!wrapped.find()) {
            throw new RuntimeException("Input encrypted password does not match pattern \"ENC(...)\"");
        }
        final String cipherText = wrapped.group(1);
        System.out.println(textEncryptor.decrypt(cipherText));
        return;
    }

    // Encrypt mode.
    if (commandLine.hasOption(PLAIN_PWD_OPTION)) {
        final String plainText = commandLine.getOptionValue(PLAIN_PWD_OPTION);
        System.out.println("ENC(" + textEncryptor.encrypt(plainText) + ")");
        return;
    }

    // Neither option given: show usage, then fail.
    printUsage();
    throw new RuntimeException(
            String.format("Must provide -%s or -%s option.", PLAIN_PWD_OPTION, ENCRYPTED_PWD_OPTION));
}
From source file:WordCount.java
public static void main(String args[]) throws Exception { String filename = "WordCount.java"; // Map File from filename to byte buffer FileInputStream input = new FileInputStream(filename); FileChannel channel = input.getChannel(); int fileLength = (int) channel.size(); MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileLength); // Convert to character buffer Charset charset = Charset.forName("ISO-8859-1"); CharsetDecoder decoder = charset.newDecoder(); CharBuffer charBuffer = decoder.decode(buffer); // Create line pattern Pattern linePattern = Pattern.compile(".*$", Pattern.MULTILINE); // Create word pattern Pattern wordBreakPattern = Pattern.compile("[\\p{Punct}\\s}]"); // Match line pattern to buffer Matcher lineMatcher = linePattern.matcher(charBuffer); Map map = new TreeMap(); Integer ONE = new Integer(1); // For each line while (lineMatcher.find()) { // Get line CharSequence line = lineMatcher.group(); // Get array of words on line String words[] = wordBreakPattern.split(line); // For each word for (int i = 0, n = words.length; i < n; i++) { if (words[i].length() > 0) { Integer frequency = (Integer) map.get(words[i]); if (frequency == null) { frequency = ONE;//from w w w . j a v a 2 s . c o m } else { int value = frequency.intValue(); frequency = new Integer(value + 1); } map.put(words[i], frequency); } } } System.out.println(map); }
From source file:com.github.liyp.test.TestMain.java
@SuppressWarnings("unchecked") public static void main(String[] args) { // add a shutdown hook to stop the server Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { @Override//from w w w .j a va2 s .c o m public void run() { System.out.println("########### shoutdown begin...."); try { Thread.sleep(10000); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("########### shoutdown end...."); } })); System.out.println(args.length); Iterator<String> iterator1 = IteratorUtils .arrayIterator(new String[] { "one", "two", "three", "11", "22", "AB" }); Iterator<String> iterator2 = IteratorUtils.arrayIterator(new String[] { "a", "b", "c", "33", "ab", "aB" }); Iterator<String> chainedIter = IteratorUtils.chainedIterator(iterator1, iterator2); System.out.println("=================="); Iterator<String> iter = IteratorUtils.filteredIterator(chainedIter, new Predicate() { @Override public boolean evaluate(Object arg0) { System.out.println("xx:" + arg0.toString()); String str = (String) arg0; return str.matches("([a-z]|[A-Z]){2}"); } }); while (iter.hasNext()) { System.out.println(iter.next()); } System.out.println("==================="); System.out.println("asas".matches("[a-z]{4}")); System.out.println("Y".equals(null)); System.out.println(String.format("%02d", 1000L)); System.out.println(ArrayUtils.toString(splitAndTrim(" 11, 21,12 ,", ","))); System.out.println(new ArrayList<String>().toString()); JSONObject json = new JSONObject("{\"keynull\":null}"); json.put("bool", false); json.put("keya", "as"); json.put("key2", 2212222222222222222L); System.out.println(json); System.out.println(json.get("keynull").equals(null)); String a = String.format("{\"id\":%d,\"method\":\"testCrossSync\"," + "\"circle\":%d},\"isEnd\":true", 1, 1); System.out.println(a.getBytes().length); System.out.println(new String[] { "a", "b" }); System.out.println(new JSONArray("[\"aa\",\"\"]")); String data = String.format("%9d %s", 1, 
RandomStringUtils.randomAlphanumeric(10)); System.out.println(data.getBytes().length); System.out.println(ArrayUtils.toString("1|2| 3| 333||| 3".split("\\|"))); JSONObject j1 = new JSONObject("{\"a\":\"11111\"}"); JSONObject j2 = new JSONObject(j1.toString()); j2.put("b", "22222"); System.out.println(j1 + " | " + j2); System.out.println("======================"); String regex = "\\d+(\\-\\d+){2} \\d+(:\\d+){2}"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher("2015-12-28 15:46:14 _NC250_MD:motion de\n"); String eventDate = matcher.find() ? matcher.group() : ""; System.out.println(eventDate); }
From source file:com.wittawat.wordseg.Main.java
public static void main(String[] args) throws Exception { Console con = System.console(); if (con == null) { System.out.println("The system must support console to run the program."); System.exit(1);/*from w ww . ja va 2 s .com*/ } // Load model System.out.println("Loading model ..."); Classifier model = Data.getDefaultModel(); System.out.println("Finished loading model."); System.out.println(getAgreement()); boolean isUseDict = true; // Dummy statement to eliminate all lazy loading System.out.println("\n" + new NukeTokenizer3( "?????", model, isUseDict).tokenize() + "\n"); System.out.println(getHelp()); final String SET_DICT_PAT_STR = "\\s*set\\s+dict\\s+(true|false)\\s*"; final Pattern SET_DICT_PAT = Pattern.compile(SET_DICT_PAT_STR); while (true) { System.out.print(">> "); String line = con.readLine(); if (line != null && !line.trim().equals("")) { line = line.trim(); try { if (line.equals("h") || line.equals("help")) { System.out.println(getHelp()); } else if (line.equals("about")) { System.out.println(getAbout()); } else if (line.equals("agreement")) { System.out.println(getAgreement()); } else if (SET_DICT_PAT.matcher(line).find()) { Matcher m = SET_DICT_PAT.matcher(line); m.find(); String v = m.group(1); isUseDict = v.equals("true"); System.out.println("Dictionary will " + (isUseDict ? 
"" : "not ") + "be used."); } else if (line.matches("q|quit|exit")) { System.out.println("Bye"); System.exit(0); } else if (line.contains(":tokfile:")) { String[] splits = line.split(":tokfile:"); String in = splits[0]; String out = splits[1]; String content = FileUtils.readFileToString(new File(in)); long start = new Date().getTime(); NukeTokenizer tokenizer = new NukeTokenizer3(content, model, isUseDict); String tokenized = tokenizer.tokenize(); long end = new Date().getTime(); System.out.println("Time to tokenize: " + (end - start) + " ms."); FileUtils.writeStringToFile(new File(out), tokenized); } else if (line.contains(":tokfile")) { String[] splits = line.split(":tokfile"); String in = splits[0]; String content = FileUtils.readFileToString(new File(in)); long start = new Date().getTime(); NukeTokenizer tokenizer = new NukeTokenizer3(content, model, isUseDict); String tokenized = tokenizer.tokenize(); long end = new Date().getTime(); System.out.println(tokenized); System.out.println("Time to tokenize: " + (end - start) + " ms."); } else if (line.contains(":tok:")) { String[] splits = line.split(":tok:"); String inText = splits[0]; String out = splits[1]; long start = new Date().getTime(); NukeTokenizer tokenizer = new NukeTokenizer3(inText, model, isUseDict); String tokenized = tokenizer.tokenize(); long end = new Date().getTime(); System.out.println("Time to tokenize: " + (end - start) + " ms."); FileUtils.writeStringToFile(new File(out), tokenized); } else if (line.contains(":tok")) { String[] splits = line.split(":tok"); String inText = splits[0]; long start = new Date().getTime(); NukeTokenizer tokenizer = new NukeTokenizer3(inText, model, isUseDict); String tokenized = tokenizer.tokenize(); long end = new Date().getTime(); System.out.println(tokenized); System.out.println("Time to tokenize: " + (end - start) + " ms."); } else { System.out.println("Unknown command"); } } catch (Exception e) { System.out.println("Error. 
See the exception."); e.printStackTrace(); } } } }
From source file:com.hp.avmon.trap.service.TrapService.java
public static void main(String[] args) { String text = "{3}123{3}{10}"; Pattern p = Pattern.compile(".*?(\\{.+?\\})"); Matcher m = p.matcher(text); while (m.find()) { System.out.println(m.group(1)); }//ww w. j av a 2s . co m }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step5LinguisticPreprocessing.java
public static void main(String[] args) throws Exception { // input dir - list of xml query containers // step4-boiler-plate/ File inputDir = new File(args[0]); // output dir File outputDir = new File(args[1]); if (!outputDir.exists()) { outputDir.mkdirs();/*w w w . j a v a2 s . c o m*/ } // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { // System.out.println(rankedResults.plainText); if (rankedResults.plainText != null) { String[] lines = StringUtils.split(rankedResults.plainText, "\n"); // collecting all cleaned lines List<String> cleanLines = new ArrayList<>(lines.length); // collecting line tags List<String> lineTags = new ArrayList<>(lines.length); for (String line : lines) { // get the tag String tag = null; Matcher m = OPENING_TAG_PATTERN.matcher(line); if (m.find()) { tag = m.group(1); } if (tag == null) { throw new IllegalArgumentException("No html tag found for line:\n" + line); } // replace the tag at the beginning and the end String noTagText = line.replaceAll("^<\\S+>", "").replaceAll("</\\S+>$", ""); // do some html cleaning noTagText = noTagText.replaceAll(" ", " "); noTagText = noTagText.trim(); // add to the output if (!noTagText.isEmpty()) { cleanLines.add(noTagText); lineTags.add(tag); } } if (cleanLines.isEmpty()) { // the document is empty System.err.println("Document " + rankedResults.clueWebID + " in query " + queryResultContainer.qID + " is empty"); } else { // now join them back to paragraphs String text = StringUtils.join(cleanLines, "\n"); // create JCas JCas jCas = JCasFactory.createJCas(); jCas.setDocumentText(text); jCas.setDocumentLanguage("en"); // annotate WebParagraph SimplePipeline.runPipeline(jCas, 
AnalysisEngineFactory.createEngineDescription(WebParagraphAnnotator.class)); // fill the original tag information List<WebParagraph> webParagraphs = new ArrayList<>( JCasUtil.select(jCas, WebParagraph.class)); // they must be the same size as original ones if (webParagraphs.size() != lineTags.size()) { throw new IllegalStateException( "Different size of annotated paragraphs and original lines"); } for (int i = 0; i < webParagraphs.size(); i++) { WebParagraph p = webParagraphs.get(i); // get tag String tag = lineTags.get(i); p.setOriginalHtmlTag(tag); } SimplePipeline.runPipeline(jCas, AnalysisEngineFactory.createEngineDescription(StanfordSegmenter.class, // only on existing WebParagraph annotations StanfordSegmenter.PARAM_ZONE_TYPES, WebParagraph.class.getCanonicalName())); // now convert to XMI ByteArrayOutputStream byteOutputStream = new ByteArrayOutputStream(); XmiCasSerializer.serialize(jCas.getCas(), byteOutputStream); // encode to base64 String encoded = new BASE64Encoder().encode(byteOutputStream.toByteArray()); rankedResults.originalXmi = encoded; } } } // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:edu.illinois.cs.cogcomp.ner.BenchmarkOutputParser.java
/** * This main method will take one required argument, idenfitying the file containing * the results. Optionally, "-single" may also be passed indicating it will extract * the F1 value for single token values only. * @param args//from w ww .java 2s . c o m * @throws IOException */ public static void main(String[] args) throws IOException { parseArgs(args); System.out.println("L1lr,L1t,L2lr,L2t,L1 token,L2 token,F1,F2"); for (File file : resultsfile.listFiles()) { if (file.getName().startsWith("L1r")) { File resultsfile = new File(file, "ner/results.out"); if (resultsfile.exists()) { try { Parameters p = parseFilename(file); String lines = FileUtils.readFileToString(resultsfile); // get the token level score. String tokenL2 = null, tokenL1 = null; Matcher matcher = l2tokenlevelpattern.matcher(lines); if (matcher.find()) tokenL2 = matcher.group(1); else { matcher = ol2tokenlevelpattern.matcher(lines); if (matcher.find()) tokenL2 = matcher.group(1); else System.err.println("No token level match"); } matcher = l1tokenlevelpattern.matcher(lines); if (matcher.find()) tokenL1 = matcher.group(1); else { matcher = ol1tokenlevelpattern.matcher(lines); if (matcher.find()) tokenL1 = matcher.group(1); else System.err.println("No token level match"); } matcher = phraselevelpattern.matcher(lines); matcher.find(); String phraseL1 = matcher.group(1); String phraseL2 = matcher.group(2); System.out.println( p.toString() + "," + tokenL1 + "," + tokenL2 + "," + phraseL1 + "," + phraseL2); } catch (java.lang.IllegalStateException ise) { System.err.println("The results file could not be parsed : \"" + resultsfile + "\""); } } else { System.err.println("no results in " + resultsfile); } } } }
From source file:ReplaceDemo.java
public static void main(String[] argv) { // Make an RE pattern to match almost any form (deamon, demon, etc.). String patt = "d[ae]{1,2}mon"; // i.e., 1 or 2 'a' or 'e' any combo // A test input. String input = "Unix hath demons and deamons in it!"; System.out.println("Input: " + input); // Run it from a RE instance and see that it works Pattern r = Pattern.compile(patt); Matcher m = r.matcher(input); System.out.println("ReplaceAll: " + m.replaceAll("daemon")); // Show the appendReplacement method m.reset();// ww w . j a v a2 s. c o m StringBuffer sb = new StringBuffer(); System.out.print("Append methods: "); while (m.find()) { m.appendReplacement(sb, "daemon"); // Copy to before first match, // plus the word "daemon" } m.appendTail(sb); // copy remainder System.out.println(sb.toString()); }