List of usage examples for org.apache.commons.io.IOUtils.lineIterator
public static LineIterator lineIterator(InputStream input, String encoding) throws IOException
Returns an Iterator for the lines in an InputStream, using the character encoding specified (or the default encoding if null).
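Before the longer real-world examples below, here is a minimal, self-contained sketch of the basic pattern. The file name data.txt is illustrative; any InputStream works the same way.

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.commons.io.IOUtils;
    import org.apache.commons.io.LineIterator;

    public class LineIteratorExample {
        public static void main(String[] args) throws IOException {
            // Hypothetical input file; any InputStream can be used here.
            InputStream in = new FileInputStream("data.txt");
            LineIterator it = IOUtils.lineIterator(in, "UTF-8");
            try {
                while (it.hasNext()) {
                    // Lines are returned without their terminators.
                    String line = it.nextLine();
                    System.out.println(line);
                }
            } finally {
                // Closing the iterator also closes the underlying stream.
                LineIterator.closeQuietly(it);
            }
        }
    }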
From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv2Reader.java

    /**
     * Iterate through lines and create span annotations accordingly. For multiple span
     * annotations, based on the position of the annotation in the line, update only the
     * end position of the annotation.
     */
    private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
            throws IOException
    {
        // getting header information
        LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
        int columns = 1; // token number + token columns (minimum required)
        int tokenStart = 0, sentenceStart = 0;
        Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
        Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();

        // an annotation for every feature in a layer
        Map<Type, Map<Integer, AnnotationFS>> annotations =
                new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();

        // store if this is a Begin/Intermediate/End of an annotation
        Map<Type, Map<Integer, String>> beginEndAnno =
                new LinkedHashMap<Type, Map<Integer, String>>();

        // Store annotations of tokens so that they can be used later for relation annotations
        Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations =
                new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();

        // store target token ids used for a relation
        Map<Type, Map<String, List<String>>> relationTargets =
                new LinkedHashMap<Type, Map<String, List<String>>>();

        // index tokens by the concatenation of their begin-end offsets so that lemma and
        // POS annotations can be attached later, if they exist
        indexedTokens = new HashMap<String, Token>();

        while (lineIterator.hasNext()) {
            String line = lineIterator.next().trim();
            if (line.trim().equals("") && sentenceStart == tokenStart) {
                continue;
            }
            if (line.trim().equals("")) {
                text.replace(tokenStart - 1, tokenStart, "");
                tokenStart = tokenStart - 1;
                Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
                sentence.addToIndexes();
                tokenStart++;
                sentenceStart = tokenStart;
                text.append("\n");
                continue;
            }
            // sentence
            if (line.startsWith("#text=")) {
                continue;
            }
            if (line.startsWith("#id=")) {
                continue; // it is a comment line
            }
            if (line.startsWith("#")) {
                columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
                continue;
            }
            // sometimes the sentence in #text= contains a line break, which breaks this
            // reader, so skip such lines
            if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
                continue;
            }
            // If we are still unlucky, the line starts with a number from the sentence but
            // not a token number; check whether it is in the format NUM-NUM
            if (!Character.isDigit(line.split("-")[1].charAt(0))) {
                continue;
            }

            int count = StringUtils.countMatches(line, "\t");
            if (columns != count) {
                throw new IOException(fileName
                        + " This is not a valid TSV File.\ncheck this line: " + line);
            }

            // adding tokens and sentence
            StringTokenizer lineTk = new StringTokenizer(line, "\t");
            String tokenNumberColumn = lineTk.nextToken();
            String tokenColumn = lineTk.nextToken();
            Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
            token.addToIndexes();
            Type posType = JCasUtil.getType(aJcas, POS.class);
            Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
            if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
                indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
            }

            // adding the annotations
            createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations,
                    beginEndAnno, tokenAnnotations, relationTargets, lineTk, tokenColumn,
                    tokenNumberColumn);

            tokenStart = tokenStart + tokenColumn.length() + 1;
            text.append(tokenColumn + " ");
        }
        if (tokenStart > sentenceStart) {
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            text.append("\n");
        }
        createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
    }
From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoCustomTsvReader.java

The setAnnotations method in this reader is identical to the WebannoTsv2Reader example above, apart from comment line wrapping, so it is not repeated here.
From source file:fr.gael.dhus.server.http.webapp.search.controller.SearchController.java
    /**
     * Provides the OpenSearch description file via the /search/description API.
     *
     * @param res response
     * @throws IOException if the description file cannot be accessed
     */
    @PreAuthorize("hasRole('ROLE_SEARCH')")
    @RequestMapping(value = "/description")
    public void search(HttpServletResponse res) throws IOException {
        String url = configurationManager.getServerConfiguration().getExternalUrl();
        if (url != null && url.endsWith("/")) {
            url = url.substring(0, url.length() - 1);
        }
        String long_name = configurationManager.getNameConfiguration().getLongName();
        String short_name = configurationManager.getNameConfiguration().getShortName();
        String contact_mail = configurationManager.getSupportConfiguration().getMail();

        InputStream is = ClassLoader.getSystemResourceAsStream(DESCRIPTION_FILE);
        if (is == null) {
            throw new IOException(
                    "Cannot find \"" + DESCRIPTION_FILE + "\" OpenSearch description file.");
        }

        LineIterator li = IOUtils.lineIterator(is, "UTF-8");
        try (ServletOutputStream os = res.getOutputStream()) {
            while (li.hasNext()) {
                String line = li.next();
                // Last line? -> the iterator eats LF
                if (li.hasNext()) {
                    line = line + "\n";
                }
                line = line.replace("[dhus_server]", url);
                if (long_name != null) {
                    line = line.replace("[dhus_long_name]", long_name);
                }
                if (short_name != null) {
                    line = line.replace("[dhus_short_name]", short_name);
                }
                if (contact_mail != null) {
                    line = line.replace("[dhus_contact_mail]", contact_mail);
                }
                os.write(line.getBytes());
            }
        } finally {
            IOUtils.closeQuietly(is);
            LineIterator.closeQuietly(li);
        }
    }
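Two details in this example are worth noting. LineIterator strips line terminators, which is why the loop re-appends "\n" to every line except the last. Also, line.getBytes() encodes with the platform default charset, so passing an explicit charset (for example line.getBytes(StandardCharsets.UTF_8)) would be a safer match for the declared UTF-8 input.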
From source file:com.norconex.collector.http.crawler.HttpCrawler.java
    private void queueStartURLs(ICrawlDataStore crawlDataStore) {
        // Queue regular start urls
        String[] startURLs = getCrawlerConfig().getStartURLs();
        if (startURLs != null) {
            for (int i = 0; i < startURLs.length; i++) {
                String startURL = startURLs[i];
                executeQueuePipeline(new HttpCrawlData(startURL, 0), crawlDataStore);
            }
        }
        // Queue start urls defined in one or more seed files
        String[] urlsFiles = getCrawlerConfig().getUrlsFiles();
        if (urlsFiles != null) {
            for (int i = 0; i < urlsFiles.length; i++) {
                String urlsFile = urlsFiles[i];
                LineIterator it = null;
                try {
                    it = IOUtils.lineIterator(new FileInputStream(urlsFile), CharEncoding.UTF_8);
                    while (it.hasNext()) {
                        String startURL = it.nextLine();
                        executeQueuePipeline(new HttpCrawlData(startURL, 0), crawlDataStore);
                    }
                } catch (IOException e) {
                    throw new CollectorException("Could not process URLs file: " + urlsFile, e);
                } finally {
                    LineIterator.closeQuietly(it);
                }
            }
        }
    }
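Assuming a Commons IO version in which LineIterator implements Closeable (true of recent releases), the null-check-and-closeQuietly pattern above can be replaced with try-with-resources. A minimal sketch under that assumption; the seed-file parameter and the println stand-in for executeQueuePipeline are illustrative:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import org.apache.commons.io.IOUtils;
    import org.apache.commons.io.LineIterator;

    public class SeedFileReader {
        // Reads one URL per line from a hypothetical seed file.
        static void queueFromSeedFile(String urlsFile) throws IOException {
            // Requires a Commons IO version where LineIterator implements Closeable.
            try (LineIterator it = IOUtils.lineIterator(
                    new FileInputStream(urlsFile), StandardCharsets.UTF_8)) {
                while (it.hasNext()) {
                    String startURL = it.nextLine();
                    System.out.println("queue: " + startURL); // stand-in for the pipeline call
                }
            }
        }
    }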
From source file:de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetFactory.java
    private Map<String, DatasetDescriptionImpl> loadFromYaml() throws IOException {
        // Scan for locators
        PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
        Resource[] locators =
                resolver.getResources("classpath:META-INF/org.dkpro.core/datasets.txt");

        // Read locators
        Set<String> patterns = new LinkedHashSet<>();
        for (Resource locator : locators) {
            try (InputStream is = locator.getInputStream()) {
                IOUtils.lineIterator(is, "UTF-8").forEachRemaining(l -> patterns.add(l));
            }
        }

        // Scan for YAML dataset descriptions
        List<Resource> resources = new ArrayList<>();
        for (String pattern : patterns) {
            for (Resource r : resolver.getResources(pattern)) {
                resources.add(r);
            }
        }

        // Configure YAML deserialization
        Constructor datasetConstructor = new Constructor(DatasetDescriptionImpl.class);
        TypeDescription datasetDesc = new TypeDescription(DatasetDescriptionImpl.class);
        datasetDesc.putMapPropertyType("artifacts", String.class, ArtifactDescriptionImpl.class);
        datasetDesc.putListPropertyType("licenses", LicenseDescriptionImpl.class);
        datasetConstructor.addTypeDescription(datasetDesc);
        TypeDescription artifactDesc = new TypeDescription(ArtifactDescriptionImpl.class);
        artifactDesc.putListPropertyType("actions", ActionDescriptionImpl.class);
        datasetConstructor.addTypeDescription(artifactDesc);
        Yaml yaml = new Yaml(datasetConstructor);

        // Ensure that there is a fixed order (at least if toString is correctly implemented)
        Collections.sort(resources, (a, b) -> {
            return a.toString().compareTo(b.toString());
        });

        // Load the YAML descriptions
        Map<String, DatasetDescriptionImpl> sets = new LinkedHashMap<>();
        for (Resource res : resources) {
            LOG.debug("Loading [" + res + "]");
            try (InputStream is = res.getInputStream()) {
                String id = FilenameUtils.getBaseName(res.getFilename());
                DatasetDescriptionImpl ds = yaml.loadAs(is, DatasetDescriptionImpl.class);
                ds.setId(id);
                ds.setOwner(this);
                // Inject artifact names into artifacts
                for (Entry<String, ArtifactDescription> e : ds.getArtifacts().entrySet()) {
                    ((ArtifactDescriptionImpl) e.getValue()).setName(e.getKey());
                }
                sets.put(ds.getId(), ds);
            }
        }
        return sets;
    }
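Since LineIterator implements Iterator<String>, Java 8 default methods such as forEachRemaining are available directly on it, which is how this example collects the locator lines into a set without an explicit while loop.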
From source file:net.sf.sprockets.database.sqlite.DbOpenHelper.java
    /**
     * Execute the statements in the resource script on the database. Each statement must
     * end with a semicolon.
     */
    private void execScript(SQLiteDatabase db, Resources res, int script) throws IOException {
        LineIterator lines = IOUtils.lineIterator(res.openRawResource(script), UTF_8);
        StringBuilder sql = new StringBuilder(2048); // enough capacity for a long statement
        try {
            // read each (potentially multi-line) statement and execute them one at a time
            while (lines.hasNext()) {
                String line = lines.next().trim();
                int length = line.length();
                if (length > 0) {
                    sql.append(line).append("\n");
                    if (line.charAt(length - 1) == ';') { // statement loaded
                        db.execSQL(sql.toString());
                        sql.setLength(0); // reset builder for a new statement
                    }
                }
            }
        } finally {
            lines.close();
        }
    }
From source file:com.daphne.es.showcase.excel.service.ExcelDataService.java
    /**
     * Imports CSV data.
     *
     * @param user the importing user
     * @param is the CSV input stream
     */
    @Async
    public void importCvs(final User user, final InputStream is) {
        ExcelDataService proxy = ((ExcelDataService) AopContext.currentProxy());
        BufferedInputStream bis = null;
        try {
            long beginTime = System.currentTimeMillis();
            bis = new BufferedInputStream(is);
            String encoding = FileCharset.getCharset(bis);

            LineIterator iterator = IOUtils.lineIterator(bis, encoding);
            String separator = ",";
            int totalSize = 0;
            final List<ExcelData> dataList = Lists.newArrayList();

            if (iterator.hasNext()) {
                iterator.nextLine(); // skip the header line
            }

            while (iterator.hasNext()) {
                totalSize++;
                String line = iterator.nextLine();
                String[] dataArray = StringUtils.split(line, separator);

                ExcelData data = new ExcelData();
                data.setId(Long.valueOf(dataArray[0]));
                data.setContent(dataArray[1]);
                dataList.add(data);

                if (totalSize % batchSize == 0) {
                    try {
                        proxy.doBatchSave(dataList);
                    } catch (Exception e) {
                        Long fromId = dataList.get(0).getId();
                        Long endId = dataList.get(dataList.size() - 1).getId();
                        log.error("from " + fromId + " to " + endId + ", error", e);
                    }
                    dataList.clear();
                }
            }
            if (dataList.size() > 0) {
                proxy.doBatchSave(dataList);
            }

            long endTime = System.currentTimeMillis();
            Map<String, Object> context = Maps.newHashMap();
            context.put("seconds", (endTime - beginTime) / 1000);
            notificationApi.notify(user.getId(), "excelImportSuccess", context);
        } catch (Exception e) {
            log.error("excel import error", e);
            Map<String, Object> context = Maps.newHashMap();
            context.put("error", e.getMessage());
            notificationApi.notify(user.getId(), "excelImportError", context);
        } finally {
            IOUtils.closeQuietly(bis);
        }
    }
From source file:edu.smu.tspell.wordnet.impl.file.InflectionData.java
    /**
     * Reads the exceptions from a single file that correspond to the exceptions for a
     * particular synset type.
     *
     * @param fileName Name of the file to read.
     * @param type Syntactic type associated with the file.
     * @throws RetrievalException An error occurred reading the exception data.
     */
    private void loadExceptions(String fileName, SynsetType type) throws IOException {
        StringTokenizer tokenizer;
        String inflection;
        String[] baseForms;

        String dir = PropertyNames.databaseDirectory;
        InputStream file = getClass().getResourceAsStream(dir + fileName);
        LineIterator iterator = IOUtils.lineIterator(file, null);

        // Loop through all lines in the file
        while (iterator.hasNext()) {
            String line = iterator.nextLine();
            // Parse the inflected word
            tokenizer = new StringTokenizer(line, WORD_DELIMITER);
            inflection = TextTranslator.translateToExternalFormat(tokenizer.nextToken());
            // Get the inflected word's base forms
            baseForms = new String[tokenizer.countTokens()];
            for (int i = 0; i < baseForms.length; i++) {
                baseForms[i] = TextTranslator.translateToExternalFormat(tokenizer.nextToken());
            }
            // Add an entry to the list for this word
            putMorphology(inflection, baseForms, type);
        }
        file.close();
    }
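Note that this example passes null as the encoding, which, per the method description above, makes lineIterator fall back to the platform default encoding. That is acceptable for data known to be ASCII, but an explicit charset is the safer choice otherwise.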
From source file:$.MessageLogParser.java
    /**
     * Gets the lines which correspond with the specified correlation ID from the
     * specified log file.
     *
     * @param logFile the log file
     * @param correlationId the correlation ID
     * @return log lines
     * @throws IOException when an error occurred during file reading
     */
    private List<String> getLogLines(File logFile, String correlationId) throws IOException {
        List<String> logLines = new ArrayList<String>();

        Log.debug("Go through the following log file: " + logFile);

        int year = Calendar.getInstance().get(Calendar.YEAR);
        String[] possibleYears = new String[] { String.valueOf(year - 1), String.valueOf(year) };

        InputStream stream = null;
        try {
            if (logFile.getName().endsWith(GZIP_FILE_EXTENSION)) {
                stream = new GZIPInputStream(new BufferedInputStream(new FileInputStream(logFile)));
            } else {
                stream = new BufferedInputStream(new FileInputStream(logFile));
            }

            LineIterator it = IOUtils.lineIterator(stream, Charset.defaultCharset());

            String requestId = null;
            boolean lastCorrectLine = false; // if previous log line belongs to requestId

            while (it.hasNext()) {
                String line = it.nextLine();

                if (requestId == null) {
                    if (StringUtils.contains(line, correlationId)) {
                        logLines.add(formatLogLine(line));

                        // finds requestID
                        requestId = getRequestId(line);
                        if (requestId != null) {
                            Log.debug("correlationId (" + correlationId + ") => requestId ("
                                    + requestId + ")");
                        }
                    }
                } else {
                    // adds lines with requestID and lines that belong to the previous log
                    // record (e.g. an XML request); it's better to also check correlationID
                    // because there is not one request ID for all repeated scheduled jobs
                    // that process partly failed messages
                    //
                    // 2013-05-23 20:22:36,754 [MACHINE_IS_UNDEFINED, ajp-bio-8009-exec-19, /esb/ws/account/v1, ...
                    // <checkCustomerCreditRequest xmlns="cleverbus.org/ws/AccountService-v1">
                    //     <firstName>csd</firstName>
                    //     <lastName>acs</lastName>
                    //     <birthNumber>111111/1111</birthNumber>
                    // </checkCustomerCreditRequest>
                    if (StringUtils.contains(line, requestId)
                            || (StringUtils.contains(line, correlationId))
                            || (lastCorrectLine && !StringUtils.startsWithAny(line, possibleYears))) {
                        logLines.add(formatLogLine(line));
                        lastCorrectLine = true;
                    } else {
                        lastCorrectLine = false;
                    }
                }
            }
        } finally {
            IOUtils.closeQuietly(stream);
        }

        return logLines;
    }
From source file:com.msopentech.odatajclient.engine.communication.request.batch.ODataBatchUtilities.java
    /**
     * Reads headers from the batch starting from the given position.
     * <p>
     * Retrieved headers will be added to the map given by the target parameter.
     *
     * @param iterator batch iterator.
     * @param target destination of the retrieved headers.
     */
    public static void readHeaders(final ODataBatchLineIterator iterator,
            final Map<String, Collection<String>> target) {
        try {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            readBatchPart(new ODataBatchController(iterator, null), baos, true);

            final LineIterator headers = IOUtils.lineIterator(
                    new ByteArrayInputStream(baos.toByteArray()), ODataConstants.UTF8);
            while (headers.hasNext()) {
                final String line = headers.nextLine().trim();
                if (StringUtils.isNotBlank(line)) {
                    addHeaderLine(line, target);
                }
            }
        } catch (Exception e) {
            LOG.error("Error retrieving headers", e);
            throw new IllegalStateException(e);
        }
    }
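Here the batch part is first buffered into a ByteArrayOutputStream, so the example also shows lineIterator reading from an in-memory ByteArrayInputStream rather than a file or network stream; any InputStream source works the same way.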