List of usage examples for java.util.List get
E get(int index);
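Before the project-level examples below, here is a minimal, self-contained sketch of the method's contract: get(int index) returns the element at the given zero-based position and throws IndexOutOfBoundsException when the index is out of range. The class name and list contents are illustrative only, not taken from any of the projects listed.

import java.util.Arrays;
import java.util.List;

public class ListGetDemo {
    public static void main(String[] args) {
        List<String> colors = Arrays.asList("red", "green", "blue");

        // zero-based positional access
        System.out.println(colors.get(0));                 // prints "red"
        System.out.println(colors.get(colors.size() - 1)); // prints "blue"

        // an index equal to size() (or a negative index) is out of range
        try {
            colors.get(colors.size());
        } catch (IndexOutOfBoundsException e) {
            System.out.println("Out of range: " + e.getMessage());
        }
    }
}

For plain sequential traversal the enhanced for loop is usually preferable; the index-based get(i) loops in the examples below are the pattern this page documents.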
From source file:sdmx.net.service.insee.INSEERESTQueryable.java
public static void main(String args[]) {
    INSEERESTQueryable registry = new INSEERESTQueryable("FR1", "http://www.bdm.insee.fr/series/sdmx");
    List<DataflowType> dfs = registry.listDataflows();
    for (int i = 0; i < dfs.size(); i++) {
        System.out.println(dfs.get(i).getName());
    }
    registry.find(dfs.get(0).getStructure()).dump();
}
From source file:com.fun.rrs.common.excel.ExportExcel.java
public static void main(String[] args) throws Throwable {
    List<String> headerList = new ArrayList<String>();
    for (int i = 1; i <= 10; i++) {
        headerList.add("" + i);
    }
    List<String> dataRowList = new ArrayList<String>();
    for (int i = 1; i <= headerList.size(); i++) {
        dataRowList.add("?" + i);
    }
    List<List<String>> dataList = new ArrayList<List<String>>();
    for (int i = 1; i <= 100; i++) {
        dataList.add(dataRowList);
    }
    ExportExcel ee = new ExportExcel("", headerList);
    for (int i = 0; i < dataList.size(); i++) {
        Row row = ee.addRow();
        for (int j = 0; j < dataList.get(i).size(); j++) {
            ee.addCell(row, j, dataList.get(i).get(j));
        }
    }
    ee.writeFile("target/export.xlsx");
    ee.dispose();
    log.debug("Export success.");
}
From source file:com.meidusa.venus.benchmark.FileLineRandomData.java
public static void main(String[] args) throws Exception {
    final FileLineRandomData mapping = new FileLineRandomData();
    mapping.setFile(new File("./role.txt"));
    mapping.init();
    List<Thread> list = new ArrayList<Thread>();
    long start = TimeUtil.currentTimeMillis();
    for (int j = 0; j < 1; j++) {
        Thread thread = new Thread() {
            public void run() {
                for (int i = 0; i < 1000; i++) {
                    System.out.println(((String[]) mapping.nextData())[1]);
                }
            }
        };
        list.add(thread);
        thread.start();
    }
    for (int i = 0; i < list.size(); i++) {
        list.get(i).join();
    }
    System.out.println("time=" + (TimeUtil.currentTimeMillis() - start));
}
From source file:de.tudarmstadt.ukp.experiments.argumentation.convincingness.sampling.Step4MTurkOutputCollector.java
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
    String inputDirWithArgumentPairs = args[0];
    File[] resultFiles;
    if (args[1].contains("*")) {
        File path = new File(args[1]);
        File directory = path.getParentFile();
        String regex = path.getName().replaceAll("\\*", "");
        List<File> files = new ArrayList<>(FileUtils.listFiles(directory, new String[] { regex }, false));
        resultFiles = new File[files.size()];
        for (int i = 0; i < files.size(); i++) {
            resultFiles[i] = files.get(i);
        }
    } else {
        // result file is a comma-separated list of CSV files from MTurk
        String[] split = args[1].split(",");
        resultFiles = new File[split.length];
        for (int i = 0; i < split.length; i++) {
            resultFiles[i] = new File(split[i]);
        }
    }
    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            throw new IOException("Cannot create directory " + outputDir);
        }
    }
    // error if output folder not empty to prevent any confusion by mixing files
    if (!FileUtils.listFiles(outputDir, null, false).isEmpty()) {
        throw new IllegalArgumentException("Output dir " + outputDir + " is not empty");
    }
    // collected assignments with empty reason for rejections
    Set<String> assignmentsWithEmptyReason = new HashSet<>();
    // parse with first line as header
    MTurkOutputReader mTurkOutputReader = new MTurkOutputReader(resultFiles);
    Collection<File> files = FileUtils.listFiles(new File(inputDirWithArgumentPairs), new String[] { "xml" },
            false);
    if (files.isEmpty()) {
        throw new IOException("No xml files found in " + inputDirWithArgumentPairs);
    }
    // statistics: how many hits with how many assignments ; hit ID / assignments
    Map<String, Map<String, Integer>> assignmentsPerHits = new HashMap<>();
    // collect accept/reject statistics
    for (Map<String, String> record : mTurkOutputReader) {
        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");
        String hitTypeId = record.get("hittypeid");
        if (!wasRejected) {
            // update statistics
            if (!assignmentsPerHits.containsKey(hitTypeId)) {
                assignmentsPerHits.put(hitTypeId, new HashMap<String, Integer>());
            }
            if (!assignmentsPerHits.get(hitTypeId).containsKey(hitID)) {
                assignmentsPerHits.get(hitTypeId).put(hitID, 0);
            }
            assignmentsPerHits.get(hitTypeId).put(hitID, assignmentsPerHits.get(hitTypeId).get(hitID) + 1);
        }
    }
    // statistics: how many hits with how many assignments ; hit ID / assignments
    Map<String, Integer> approvedAssignmentsPerHit = new HashMap<>();
    Map<String, Integer> rejectedAssignmentsPerHit = new HashMap<>();
    // collect accept/reject statistics
    for (Map<String, String> record : mTurkOutputReader) {
        boolean approved = "Approved".equals(record.get("assignmentstatus"));
        boolean rejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");
        if (approved) {
            // update statistics
            if (!approvedAssignmentsPerHit.containsKey(hitID)) {
                approvedAssignmentsPerHit.put(hitID, 0);
            }
            approvedAssignmentsPerHit.put(hitID, approvedAssignmentsPerHit.get(hitID) + 1);
        } else if (rejected) {
            // update statistics
            if (!rejectedAssignmentsPerHit.containsKey(hitID)) {
                rejectedAssignmentsPerHit.put(hitID, 0);
            }
            rejectedAssignmentsPerHit.put(hitID, rejectedAssignmentsPerHit.get(hitID) + 1);
        } else {
            throw new IllegalStateException(
                    "Unknown state: " + record.get("assignmentstatus") + " HITID: " + hitID);
        }
    }
    // System.out.println("Approved: " + approvedAssignmentsPerHit);
    // System.out.println("Rejected: " + rejectedAssignmentsPerHit);
    System.out.println("Approved (values): " + new HashSet<>(approvedAssignmentsPerHit.values()));
    System.out.println("Rejected (values): " + new HashSet<>(rejectedAssignmentsPerHit.values()));
    // rejection statistics
    int totalRejected = 0;
    for (Map.Entry<String, Integer> rejectionEntry : rejectedAssignmentsPerHit.entrySet()) {
        totalRejected += rejectionEntry.getValue();
    }
    System.out.println("Total rejections: " + totalRejected);
    /*
    // generate .success files for adding more annotations
    for (File resultFile : resultFiles) {
        String hitTypeID = mTurkOutputReader.getHitTypeIdForFile().get(resultFile);
        // assignments for that hittypeid (= file)
        Map<String, Integer> assignments = assignmentsPerHits.get(hitTypeID);
        prepareUpdateHITsFiles(assignments, hitTypeID, resultFile);
    }
    */
    int totalSavedPairs = 0;
    // load all previously prepared argument pairs
    for (File file : files) {
        List<ArgumentPair> argumentPairs = (List<ArgumentPair>) XStreamTools.getXStream().fromXML(file);
        List<AnnotatedArgumentPair> annotatedArgumentPairs = new ArrayList<>();
        for (ArgumentPair argumentPair : argumentPairs) {
            AnnotatedArgumentPair annotatedArgumentPair = new AnnotatedArgumentPair(argumentPair);
            // is there such an answer?
            String key = "Answer." + argumentPair.getId();
            // iterate only if there is such column to save time
            if (mTurkOutputReader.getColumnNames().contains(key)) {
                // now find the results
                for (Map<String, String> record : mTurkOutputReader) {
                    if (record.containsKey(key)) {
                        // extract the values
                        AnnotatedArgumentPair.MTurkAssignment assignment = new AnnotatedArgumentPair.MTurkAssignment();
                        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));
                        // only non-rejected (if required)
                        if (!wasRejected) {
                            String hitID = record.get("hitid");
                            String workerID = record.get("workerid");
                            String assignmentId = record.get("assignmentid");
                            try {
                                assignment.setAssignmentAcceptTime(
                                        DATE_FORMAT.parse(record.get("assignmentaccepttime")));
                                assignment.setAssignmentSubmitTime(
                                        DATE_FORMAT.parse(record.get("assignmentsubmittime")));
                                assignment.setHitComment(record.get("Answer.feedback"));
                                assignment.setHitID(hitID);
                                assignment.setTurkID(workerID);
                                assignment.setAssignmentId(assignmentId);
                                // and answer specific fields
                                String valueRaw = record.get(key);
                                // so far the label has had format aXXX_aYYY_a1, aXXX_aYYY_a2, or aXXX_aYYY_equal
                                // strip now only true label
                                String label = valueRaw.split("_")[2];
                                assignment.setValue(label);
                                String reason = record.get(key + "_reason");
                                // missing reason
                                if (reason == null) {
                                    assignmentsWithEmptyReason.add(assignmentId);
                                } else {
                                    assignment.setReason(reason);
                                    // get worker's stance
                                    String stanceRaw = record.get(key + "_stance");
                                    if (stanceRaw != null) {
                                        // parse stance
                                        String stance = stanceRaw.split("_stance_")[1];
                                        assignment.setWorkerStance(stance);
                                    }
                                    // we take maximal 5 assignments
                                    Collections.sort(annotatedArgumentPair.mTurkAssignments,
                                            new Comparator<AnnotatedArgumentPair.MTurkAssignment>() {
                                                @Override
                                                public int compare(AnnotatedArgumentPair.MTurkAssignment o1,
                                                        AnnotatedArgumentPair.MTurkAssignment o2) {
                                                    return o1.getAssignmentAcceptTime()
                                                            .compareTo(o2.getAssignmentAcceptTime());
                                                }
                                            });
                                    if (annotatedArgumentPair.mTurkAssignments
                                            .size() < MAXIMUM_ASSIGNMENTS_PER_HIT) {
                                        annotatedArgumentPair.mTurkAssignments.add(assignment);
                                    }
                                }
                            } catch (IllegalArgumentException | NullPointerException ex) {
                                System.err.println("Malformed annotations for HIT " + hitID + ", worker "
                                        + workerID + ", assignment " + assignmentId + "; " + ex.getMessage()
                                        + ", full record: " + record);
                            }
                        }
                    }
                }
            }
            // and if there are some annotations, add it to the result set
            if (!annotatedArgumentPair.mTurkAssignments.isEmpty()) {
                annotatedArgumentPairs.add(annotatedArgumentPair);
            }
        }
        if (!annotatedArgumentPairs.isEmpty()) {
            File outputFile = new File(outputDir, file.getName());
            XStreamTools.toXML(annotatedArgumentPairs, outputFile);
            System.out.println("Saved " + annotatedArgumentPairs.size() + " annotated pairs to " + outputFile);
            totalSavedPairs += annotatedArgumentPairs.size();
        }
    }
    System.out.println("Total saved " + totalSavedPairs + " pairs");
    // print assignments with empty reasons
    if (!assignmentsWithEmptyReason.isEmpty()) {
        System.out.println(
                "== Assignments with empty reason:\nassignmentIdToReject\tassignmentIdToRejectComment");
        for (String assignmentId : assignmentsWithEmptyReason) {
            System.out.println(
                    assignmentId + "\t\"Dear worker, you did not fill the required field with a reason.\"");
        }
    }
}
From source file:kindleclippings.word.QuizletSync.java
public static void main(String[] args) throws IOException, JSONException, URISyntaxException,
        InterruptedException, BackingStoreException, BadLocationException {
    JFileChooser fc = new JFileChooser();
    fc.setFileFilter(new FileNameExtensionFilter("Word documents", "doc", "rtf", "txt"));
    fc.setMultiSelectionEnabled(true);
    int result = fc.showOpenDialog(null);
    if (result != JFileChooser.APPROVE_OPTION) {
        return;
    }
    File[] clf = fc.getSelectedFiles();
    if (clf == null || clf.length == 0)
        return;
    ProgressMonitor progress = new ProgressMonitor(null, "QuizletSync", "loading notes files", 0, 100);
    progress.setMillisToPopup(0);
    progress.setMillisToDecideToPopup(0);
    progress.setProgress(0);
    try {
        progress.setNote("checking Quizlet account");
        progress.setProgress(5);
        Preferences prefs = kindleclippings.quizlet.QuizletSync.getPrefs();
        QuizletAPI api = new QuizletAPI(prefs.get("access_token", null));
        Collection<TermSet> sets = null;
        try {
            progress.setNote("checking Quizlet library");
            progress.setProgress(10);
            sets = api.getSets(prefs.get("user_id", null));
        } catch (IOException e) {
            if (e.toString().contains("401")) {
                // Not Authorized => Token has been revoked
                kindleclippings.quizlet.QuizletSync.clearPrefs();
                prefs = kindleclippings.quizlet.QuizletSync.getPrefs();
                api = new QuizletAPI(prefs.get("access_token", null));
                sets = api.getSets(prefs.get("user_id", null));
            } else {
                throw e;
            }
        }
        progress.setProgress(15);
        progress.setMaximum(15 + clf.length * 10);
        progress.setNote("uploading new notes");
        int pro = 15;
        int addedSets = 0;
        int updatedTerms = 0;
        int updatedSets = 0;
        for (File f : clf) {
            progress.setProgress(pro);
            List<Clipping> clippings = readClippingsFile(f);
            if (clippings == null) {
                pro += 10;
                continue;
            }
            if (clippings.isEmpty()) {
                pro += 10;
                continue;
            }
            if (clippings.size() < 2) {
                pro += 10;
                continue;
            }
            String book = clippings.get(0).getBook();
            progress.setNote(book);
            TermSet termSet = null;
            String x = book.toLowerCase().replaceAll("\\W", "");
            for (TermSet t : sets) {
                if (t.getTitle().toLowerCase().replaceAll("\\W", "").equals(x)) {
                    termSet = t;
                    break;
                }
            }
            if (termSet == null) {
                addSet(api, book, clippings);
                addedSets++;
                pro += 10;
                continue;
            }
            // compare against existing terms
            boolean hasUpdated = false;
            for (Clipping cl : clippings) {
                if (!kindleclippings.quizlet.QuizletSync.checkExistingTerm(cl, termSet)) {
                    kindleclippings.quizlet.QuizletSync.addTerm(api, termSet, cl);
                    updatedTerms++;
                    hasUpdated = true;
                }
            }
            pro += 10;
            if (hasUpdated)
                updatedSets++;
        }
        if (updatedTerms == 0 && addedSets == 0) {
            JOptionPane.showMessageDialog(null, "Done.\nNo new data was uploaded", "QuizletSync",
                    JOptionPane.OK_OPTION);
        } else {
            if (addedSets > 0) {
                JOptionPane.showMessageDialog(null,
                        String.format("Done.\nCreated %d new sets and added %d cards to %d existing sets",
                                addedSets, updatedSets, updatedTerms),
                        "QuizletSync", JOptionPane.OK_OPTION);
            } else {
                JOptionPane.showMessageDialog(null,
                        String.format("Done.\nAdded %d cards to %d existing sets", updatedTerms, updatedSets),
                        "QuizletSync", JOptionPane.OK_OPTION);
            }
        }
    } finally {
        progress.close();
    }
    System.exit(0);
}
From source file:com.topsem.common.io.excel.ExportExcel.java
public static void main(String[] args) throws Throwable {
    List<String> headerList = Lists.newArrayList();
    for (int i = 1; i <= 10; i++) {
        headerList.add("" + i);
    }
    List<String> dataRowList = Lists.newArrayList();
    for (int i = 1; i <= headerList.size(); i++) {
        dataRowList.add("?" + i);
    }
    List<List<String>> dataList = Lists.newArrayList();
    for (int i = 1; i <= 100; i++) {
        dataList.add(dataRowList);
    }
    ExportExcel ee = new ExportExcel("", headerList);
    for (int i = 0; i < dataList.size(); i++) {
        Row row = ee.addRow();
        for (int j = 0; j < dataList.get(i).size(); j++) {
            ee.addCell(row, j, dataList.get(i).get(j));
        }
    }
    ee.writeFile("target/export.xlsx");
    ee.dispose();
    log.debug("Export success.");
}
From source file:edu.osu.ling.pep.Pep.java
/**
 * Invokes Pep from the command line.
 * <p>
 * The main work this method does, apart from tokenizing the arguments and
 * input tokens, is to load and parse the XML grammar file (as specified by
 * <code>-g</code> or <code>--grammar</code>). If any of the arguments
 * <code>-g</code>, <code>--grammar</code>, <code>-s</code>,
 * <code>--seed</code>, <code>-o</code>, <code>--option</code>, occur with
 * no argument following, this method prints an error notifying the user.
 *
 * @param args
 *            The expected arguments are as follows, and can occur in any
 *            particular order:
 *            <ul>
 *            <li><code>-g|--grammar <grammar file></code></li>
 *            <li><code>-s|--seed <seed category></code></li>
 *            <li><code>-v|--verbose {verbosity level}</code></li>
 *            <li><code>-o|--option <OPTION_NAME=value></code></li>
 *            <li><code>-h|--help (prints usage information)</code></li>
 *            <li><code><token1 ... token<em>n</em>></code> (or <code>-</code>
 *            for standard input)</li>
 *            </ul>
 *            <code>OPTION_NAME</code> must be the name of one of the
 *            recognized {@link ParserOption options}. If <code>-h</code> or
 *            <code>--help</code> occur anywhere in the arguments, usage
 *            information is printed and no parsing takes place.
 */
@SuppressWarnings("static-access")
public static final void main(final String[] args) {
    try {
        final Options opts = new Options();
        opts.addOption(OptionBuilder.withLongOpt("grammar").withDescription("the grammar to use").hasArg()
                .isRequired().withArgName("grammar file").create('g'));
        opts.addOption(OptionBuilder.withLongOpt("seed").withDescription("the seed category to parse for")
                .hasArg().isRequired().withArgName("seed category").create('s'));
        opts.addOption(OptionBuilder.withLongOpt("verbose").withDescription("0-3").hasOptionalArg()
                .withArgName("verbosity level").create('v'));
        opts.addOption(OptionBuilder.withLongOpt("option").withDescription("sets parser options")
                .withArgName("OPTION=value").hasArgs(2).withValueSeparator()
                .withDescription("use value for given property").create("o"));
        opts.addOption(OptionBuilder.withLongOpt("help").withDescription("prints this message").create('h'));
        final CommandLineParser parser = new GnuParser();
        try {
            final CommandLine line = parser.parse(opts, args);
            if (line.hasOption('h')) {
                Pep.printHelp(opts);
            } else {
                final int v = Integer.parseInt(line.getOptionValue('v', Integer.toString(Pep.V_PARSE)));
                if (v < 0) {
                    throw new PepException("verbosity < 0: " + v);
                }
                Pep.verbosity = v;
                final Map<ParserOption, Boolean> options = new EnumMap<ParserOption, Boolean>(
                        ParserOption.class);
                final Properties props = line.getOptionProperties("o");
                for (final Object key : props.keySet()) {
                    try {
                        options.put(ParserOption.valueOf(key.toString()),
                                Boolean.valueOf(props.get(key).toString()));
                    } catch (final IllegalArgumentException iae) {
                        Pep.printError("no option named " + key.toString());
                        Pep.printHelp(opts);
                        return;
                    }
                }
                final Pep pep = new Pep(options);
                // final Grammar grammar =
                //         new GrammarParser(Pep.findGrammar(line.getOptionValue('g'))).t.parse();
                final List<?> ts = line.getArgList();
                List<String> tokens = null;
                if (ts.isEmpty() || ts.get(0).equals("-")) {
                    tokens = Pep.readTokens(new Scanner(System.in));
                } else {
                    tokens = new ArrayList<String>(ts.size());
                    for (final Object t : ts) {
                        tokens.add(t.toString());
                    }
                }
                pep.lastParseStart = System.currentTimeMillis();
                // try {
                //     pep.parse(grammar, tokens, new Category(line.getOptionValue('s')));
                // } catch (final PepException ignore) {
                //     // ignore here, we're listening
                // }
            }
        } catch (final ParseException pe) {
            Pep.printError("command-line syntax problem: " + pe.getMessage());
            Pep.printHelp(opts);
        }
    } catch (final PepException pe) {
        final Throwable cause = pe.getCause();
        Pep.printError((cause == null) ? pe : cause);
    } catch (final RuntimeException re) {
        Pep.printError(re);
    }
}
From source file:eu.fbk.dh.tint.tokenizer.ItalianTokenizer.java
public static void main(String argv[]) throws IOException {
    ItalianTokenizer tokenizer = new ItalianTokenizer();

    // byte[] file = Files.readAllBytes((new File("/Users/alessio/Desktop/milano.txt")).toPath());
    // String text = new String(file);
    String text = "Clinton in testa nei sondaggi dopo l'assoluzione dell'Fbi sull'uso di un server di posta privato quando era Segretario di stato.";
    // text = "``Determinato, pronto a fare tutto il necessario per mantenere la stabilità dei prezzi.''"
    //         + " Ma anche allarmato per come le conseguenze del referendum britannico minacciano l'economia e i mercati europei."
    //         + " Sono nato nel 200 S.p.A."
    //         + " Il mio indirizzo e-mail è alessio@apnetwork.it."
    //         + " Il blog è http://www.ziorufus.it e mi piace molto.";
    // text = "Questo è un test per una sigla qualsiasi tipo a.B.C. che non ha senso.";
    // text = "Milano (/milano/ ascolta[?info], in milanese: Milan[4], /mil?/[5]) è una città italiana di 1 346 153 abitanti[2], capoluogo dell'omonima città metropolitana e della regione Lombardia, secondo comune italiano per numero di abitanti, tredicesimo comune dell'Unione europea e diciannovesimo del continente e, con l'agglomerato urbano, quarta area metropolitana più popolata d'Europa dopo Londra, Madrid e Parigi[6].\n"
    //         + "\n"
    //         + "Fondata dagli Insubri all'inizio del VI secolo a.C.[7], fu conquistata dai Romani nel 222 a.C.";
    // System.out.println(text);

    long time = System.currentTimeMillis();
    List<List<CoreLabel>> sentences = tokenizer.parse(text);
    time = System.currentTimeMillis() - time;

    for (int i = 0; i < Math.min(10, sentences.size()); i++) {
        List<CoreLabel> sentence = sentences.get(i);
        for (CoreLabel token : sentence) {
            System.out.println(token.word() + " -- " + token.originalText() + " -- " + token.beginPosition());
        }
        System.out.println();
    }

    int sentenceSize = sentences.size();
    int lastTokenIndex = sentences.get(sentenceSize - 1).get(sentences.get(sentenceSize - 1).size() - 1)
            .index();

    System.out.println("Length: " + text.length());
    System.out.println("Time: " + time);
    System.out.println("Sentences: " + sentenceSize);
    System.out.println("Tokens: " + lastTokenIndex);
}
From source file:de.tudarmstadt.ukp.experiments.argumentation.convincingness.sampling.Step5bAgreementMeasures.java
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
    String inputDir = args[0];
    // all annotations
    List<AnnotatedArgumentPair> allArgumentPairs = new ArrayList<>();
    Collection<File> files = IOHelper.listXmlFiles(new File(inputDir));
    for (File file : files) {
        allArgumentPairs.addAll((List<AnnotatedArgumentPair>) XStreamTools.getXStream().fromXML(file));
    }
    // for collecting the rank of n-th best worker per HIT
    SortedMap<Integer, DescriptiveStatistics> nThWorkerOnHITRank = new TreeMap<>();
    // confusion matrix wrt. gold data for each n-th best worker on HIT
    SortedMap<Integer, ConfusionMatrix> nThWorkerOnHITConfusionMatrix = new TreeMap<>();
    // initialize maps
    for (int i = 0; i < TOP_K_VOTES; i++) {
        nThWorkerOnHITRank.put(i, new DescriptiveStatistics());
        nThWorkerOnHITConfusionMatrix.put(i, new ConfusionMatrix());
    }
    for (AnnotatedArgumentPair argumentPair : allArgumentPairs) {
        // sort turker rank and their vote
        SortedMap<Integer, String> rankAndVote = new TreeMap<>();
        System.out.println(argumentPair.mTurkAssignments.size());
        for (AnnotatedArgumentPair.MTurkAssignment assignment : argumentPair.mTurkAssignments) {
            rankAndVote.put(assignment.getTurkRank(), assignment.getValue());
        }
        String goldLabel = argumentPair.getGoldLabel();
        System.out.println(rankAndVote);
        // top K workers for the HIT
        List<String> topKVotes = new ArrayList<>(rankAndVote.values()).subList(0, TOP_K_VOTES);
        // rank of top K workers
        List<Integer> topKRanks = new ArrayList<>(rankAndVote.keySet()).subList(0, TOP_K_VOTES);
        System.out.println("Top K votes: " + topKVotes);
        System.out.println("Top K ranks: " + topKRanks);
        // extract only category (a1, a2, or equal)
        List<String> topKVotesOnlyCategory = new ArrayList<>();
        for (String vote : topKVotes) {
            String category = vote.split("_")[2];
            topKVotesOnlyCategory.add(category);
        }
        System.out.println("Top " + TOP_K_VOTES + " workers' decisions: " + topKVotesOnlyCategory);
        if (goldLabel == null) {
            System.out.println("No gold label estimate for " + argumentPair.getId());
        } else {
            // update statistics
            for (int i = 0; i < TOP_K_VOTES; i++) {
                nThWorkerOnHITConfusionMatrix.get(i).increaseValue(goldLabel, topKVotesOnlyCategory.get(i));
                // rank is +1 (we don't start ranking from zero)
                nThWorkerOnHITRank.get(i).addValue(topKRanks.get(i) + 1);
            }
        }
    }
    for (int i = 0; i < TOP_K_VOTES; i++) {
        System.out.println("n-th worker : " + (i + 1) + " -----------");
        System.out.println(nThWorkerOnHITConfusionMatrix.get(i).printNiceResults());
        System.out.println(nThWorkerOnHITConfusionMatrix.get(i));
        System.out.println("Average rank: " + nThWorkerOnHITRank.get(i).getMean() + ", stddev "
                + nThWorkerOnHITRank.get(i).getStandardDeviation());
    }
}
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step5LinguisticPreprocessing.java
public static void main(String[] args) throws Exception {
    // input dir - list of xml query containers
    // step4-boiler-plate/
    File inputDir = new File(args[0]);
    // output dir
    File outputDir = new File(args[1]);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }
    // iterate over query containers
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));
        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            // System.out.println(rankedResults.plainText);
            if (rankedResults.plainText != null) {
                String[] lines = StringUtils.split(rankedResults.plainText, "\n");
                // collecting all cleaned lines
                List<String> cleanLines = new ArrayList<>(lines.length);
                // collecting line tags
                List<String> lineTags = new ArrayList<>(lines.length);
                for (String line : lines) {
                    // get the tag
                    String tag = null;
                    Matcher m = OPENING_TAG_PATTERN.matcher(line);
                    if (m.find()) {
                        tag = m.group(1);
                    }
                    if (tag == null) {
                        throw new IllegalArgumentException("No html tag found for line:\n" + line);
                    }
                    // replace the tag at the beginning and the end
                    String noTagText = line.replaceAll("^<\\S+>", "").replaceAll("</\\S+>$", "");
                    // do some html cleaning
                    noTagText = noTagText.replaceAll(" ", " ");
                    noTagText = noTagText.trim();
                    // add to the output
                    if (!noTagText.isEmpty()) {
                        cleanLines.add(noTagText);
                        lineTags.add(tag);
                    }
                }
                if (cleanLines.isEmpty()) {
                    // the document is empty
                    System.err.println("Document " + rankedResults.clueWebID + " in query "
                            + queryResultContainer.qID + " is empty");
                } else {
                    // now join them back to paragraphs
                    String text = StringUtils.join(cleanLines, "\n");
                    // create JCas
                    JCas jCas = JCasFactory.createJCas();
                    jCas.setDocumentText(text);
                    jCas.setDocumentLanguage("en");
                    // annotate WebParagraph
                    SimplePipeline.runPipeline(jCas,
                            AnalysisEngineFactory.createEngineDescription(WebParagraphAnnotator.class));
                    // fill the original tag information
                    List<WebParagraph> webParagraphs = new ArrayList<>(
                            JCasUtil.select(jCas, WebParagraph.class));
                    // they must be the same size as original ones
                    if (webParagraphs.size() != lineTags.size()) {
                        throw new IllegalStateException(
                                "Different size of annotated paragraphs and original lines");
                    }
                    for (int i = 0; i < webParagraphs.size(); i++) {
                        WebParagraph p = webParagraphs.get(i);
                        // get tag
                        String tag = lineTags.get(i);
                        p.setOriginalHtmlTag(tag);
                    }
                    SimplePipeline.runPipeline(jCas,
                            AnalysisEngineFactory.createEngineDescription(StanfordSegmenter.class,
                                    // only on existing WebParagraph annotations
                                    StanfordSegmenter.PARAM_ZONE_TYPES, WebParagraph.class.getCanonicalName()));
                    // now convert to XMI
                    ByteArrayOutputStream byteOutputStream = new ByteArrayOutputStream();
                    XmiCasSerializer.serialize(jCas.getCas(), byteOutputStream);
                    // encode to base64
                    String encoded = new BASE64Encoder().encode(byteOutputStream.toByteArray());
                    rankedResults.originalXmi = encoded;
                }
            }
        }
        // and save the query to output dir
        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }
}