Usage examples for javax.xml.stream.XMLEventReader.nextEvent()
/** Returns the next {@code XMLEvent} from this reader; declared to throw {@code XMLStreamException}. */
public XMLEvent nextEvent() throws XMLStreamException;
From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java
public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream sWikipediaDump) throws FileNotFoundException, XMLStreamException { // <text xml:space="preserve">#REDIRECT [[Autopoiesis]]</text> // <text xml:space="preserve">#REDIRECT:[[Hans Leo Haler]]</text> // <text xml:space="preserve">#redirect [[Weier Hai]]</text> // #weiterleitung // <page> // <title>Autopoiesis</title> Logger.getLogger(WikipediaDumpParser.class.getName()).info("will collect redirects from wikipedia dump..."); MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueBalancedTreeMap<String, String>(); String strCurrentTitle = ""; XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(sWikipediaDump, "Utf-8"); int iTitlesRead = 0; while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (!xmlEvent.isStartElement()) continue; // wenn wir einen Title haben, dann merken wir uns den, falls wir ihn brauchen if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) { strCurrentTitle = readNextCharEventsText(xmlEventReader); iTitlesRead++;/* ww w . j av a 2 s . co m*/ if (iTitlesRead % 200000 == 0) Logger.getLogger(WikipediaDumpParser.class.getName()) .info("read doc #" + StringUtils.beautifyNumber(iTitlesRead)); continue; } if (!xmlEvent.asStartElement().getName().getLocalPart().equals("text")) continue; // jetzt haben wir ein text-tag. Wir schauen, ob jetzt ein redirect kommt // entweder kommt ein charEvent oder ein EndEvent. 
Leere Texte gibts wohl auch XMLEvent nextEvent = xmlEventReader.peek(); if (!nextEvent.isCharacters()) continue; String strCharEventData = readNextCharEventsText(xmlEventReader); if (strCharEventData == null) continue; strCharEventData = strCharEventData.trim(); boolean bRedirect = false; if (strCharEventData.length() >= 9 && strCharEventData.substring(0, 9).equalsIgnoreCase("#redirect")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 8 && strCharEventData.substring(0, 8).equalsIgnoreCase("redirect") && !strCharEventData.contains("\n")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 14 && strCharEventData.substring(0, 14).equalsIgnoreCase("#weiterleitung")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 13 && strCharEventData.substring(0, 13).equalsIgnoreCase("weiterleitung") && !strCharEventData.contains("\n")) bRedirect = true; if (!bRedirect) continue; // wir haben einen redirect - der wird in unsere Datenstruktur eingetragen int iStart = strCharEventData.indexOf("[["); int iEnd = strCharEventData.indexOf("]]"); if (iStart < 0 || iEnd < 0) continue; if (iEnd <= iStart) continue; if ((iStart + 2) > strCharEventData.length() || iEnd > strCharEventData.length()) continue; String strRedirectTarget = strCharEventData.substring(iStart + 2, iEnd).trim(); hsPageTitle2Redirects.add(strRedirectTarget, strCurrentTitle); // if("Venceslav Konstantinov".equalsIgnoreCase(strCurrentTitle) || "Venceslav Konstantinov".equalsIgnoreCase(strRedirectTarget)) // System.out.println("redirect found: (" + hsPageTitle2Redirects.keySize() + ") " + strCurrentTitle + " => '" + strRedirectTarget + "'"); } Logger.getLogger(WikipediaDumpParser.class.getName()) .info("Redirects found: " + StringUtils.beautifyNumber(hsPageTitle2Redirects.valueSize())); return hsPageTitle2Redirects; }
From source file:json_to_xml_1.java
/**
 * Runs one JSON-to-XML conversion job.
 *
 * <p>{@code args[0]} is the job file (XML) and {@code args[1]} the result info file. The job file is
 * scanned for a {@code json-input-file} and an {@code xml-output-file} element, each carrying a
 * {@code path} attribute; relative paths are resolved against the job file's directory. The JSON input is
 * then converted and written to the output file as UTF-8 XML.
 *
 * <p>All streams, readers and writers opened here are closed before the method returns or throws. The
 * JSON input is decoded as UTF-8 instead of the platform default charset.
 *
 * @param args the job file path followed by the result info file path
 * @return 0 on success
 * @throws ProgramTerminationException for missing arguments, unusable files, job file read errors,
 *         conversion errors or output write errors
 */
public int execute(String args[]) throws ProgramTerminationException {
    this.getInfoMessages().clear();

    if (args.length < 2) {
        throw constructTermination("messageArgumentsMissing", null,
                getI10nString("messageArgumentsMissingUsage") + "\n\tjson_to_xml_1 "
                        + getI10nString("messageParameterList") + "\n");
    }

    File resultInfoFile = new File(args[1]);
    try {
        resultInfoFile = resultInfoFile.getCanonicalFile();
    } catch (SecurityException ex) {
        throw constructTermination("messageResultInfoFileCantGetCanonicalPath", ex, null,
                resultInfoFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageResultInfoFileCantGetCanonicalPath", ex, null,
                resultInfoFile.getAbsolutePath());
    }
    if (resultInfoFile.exists()) {
        if (!resultInfoFile.isFile()) {
            throw constructTermination("messageResultInfoPathIsntAFile", null, null,
                    resultInfoFile.getAbsolutePath());
        }
        if (!resultInfoFile.canWrite()) {
            throw constructTermination("messageResultInfoFileIsntWritable", null, null,
                    resultInfoFile.getAbsolutePath());
        }
    }
    json_to_xml_1.resultInfoFile = resultInfoFile;

    File jobFile = new File(args[0]);
    try {
        jobFile = jobFile.getCanonicalFile();
    } catch (SecurityException ex) {
        throw constructTermination("messageJobFileCantGetCanonicalPath", ex, null, jobFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageJobFileCantGetCanonicalPath", ex, null, jobFile.getAbsolutePath());
    }
    if (!jobFile.exists()) {
        throw constructTermination("messageJobFileDoesntExist", null, null, jobFile.getAbsolutePath());
    }
    if (!jobFile.isFile()) {
        throw constructTermination("messageJobPathIsntAFile", null, null, jobFile.getAbsolutePath());
    }
    if (!jobFile.canRead()) {
        throw constructTermination("messageJobFileIsntReadable", null, null, jobFile.getAbsolutePath());
    }

    System.out.println("json_to_xml_1: " + getI10nStringFormatted("messageCallDetails",
            jobFile.getAbsolutePath(), resultInfoFile.getAbsolutePath()));

    File inputFile = null;
    File outputFile = null;

    try {
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        InputStream in = new FileInputStream(jobFile);
        try {
            XMLEventReader eventReader = inputFactory.createXMLEventReader(in);
            try {
                while (eventReader.hasNext()) {
                    XMLEvent event = eventReader.nextEvent();
                    if (!event.isStartElement()) {
                        continue;
                    }
                    StartElement element = event.asStartElement();
                    String tagName = element.getName().getLocalPart();

                    if (tagName.equals("json-input-file")) {
                        Attribute pathAttribute = element.getAttributeByName(new QName("path"));
                        if (pathAttribute == null) {
                            throw constructTermination("messageJobFileEntryIsMissingAnAttribute", null, null,
                                    jobFile.getAbsolutePath(), tagName, "path");
                        }
                        String inputFilePath = pathAttribute.getValue();
                        if (inputFilePath.isEmpty()) {
                            throw constructTermination("messageJobFileAttributeValueIsEmpty", null, null,
                                    jobFile.getAbsolutePath(), tagName, "path");
                        }

                        inputFile = new File(inputFilePath);
                        if (!inputFile.isAbsolute()) {
                            // Relative paths are interpreted relative to the job file's directory.
                            inputFile = new File(
                                    jobFile.getAbsoluteFile().getParent() + File.separator + inputFilePath);
                        }
                        try {
                            inputFile = inputFile.getCanonicalFile();
                        } catch (SecurityException ex) {
                            throw constructTermination("messageInputFileCantGetCanonicalPath", ex, null,
                                    new File(inputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                        } catch (IOException ex) {
                            throw constructTermination("messageInputFileCantGetCanonicalPath", ex, null,
                                    new File(inputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                        }
                        if (!inputFile.exists()) {
                            throw constructTermination("messageInputFileDoesntExist", null, null,
                                    inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                        }
                        if (!inputFile.isFile()) {
                            throw constructTermination("messageInputPathIsntAFile", null, null,
                                    inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                        }
                        if (!inputFile.canRead()) {
                            throw constructTermination("messageInputFileIsntReadable", null, null,
                                    inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                        }
                    } else if (tagName.equals("xml-output-file")) {
                        Attribute pathAttribute = element.getAttributeByName(new QName("path"));
                        if (pathAttribute == null) {
                            throw constructTermination("messageJobFileEntryIsMissingAnAttribute", null, null,
                                    jobFile.getAbsolutePath(), tagName, "path");
                        }
                        String outputFilePath = pathAttribute.getValue();
                        if (outputFilePath.isEmpty()) {
                            throw constructTermination("messageJobFileAttributeValueIsEmpty", null, null,
                                    jobFile.getAbsolutePath(), tagName, "path");
                        }

                        outputFile = new File(outputFilePath);
                        if (!outputFile.isAbsolute()) {
                            outputFile = new File(
                                    jobFile.getAbsoluteFile().getParent() + File.separator + outputFilePath);
                        }
                        try {
                            outputFile = outputFile.getCanonicalFile();
                        } catch (SecurityException ex) {
                            throw constructTermination("messageOutputFileCantGetCanonicalPath", ex, null,
                                    new File(outputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                        } catch (IOException ex) {
                            throw constructTermination("messageOutputFileCantGetCanonicalPath", ex, null,
                                    new File(outputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                        }
                        if (outputFile.exists()) {
                            if (!outputFile.isFile()) {
                                throw constructTermination("messageOutputPathIsntAFile", null, null,
                                        outputFile.getAbsolutePath());
                            }
                            if (!outputFile.canWrite()) {
                                throw constructTermination("messageOutputFileIsntWritable", null, null,
                                        outputFile.getAbsolutePath());
                            }
                        }
                    }
                }
            } finally {
                try {
                    eventReader.close();
                } catch (XMLStreamException ignored) {
                    // Read-only parser: a failed close loses nothing and must not mask the real outcome.
                }
            }
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Read-only stream: a failed close loses nothing and must not mask the real outcome.
            }
        }
    } catch (XMLStreamException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    } catch (SecurityException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    }

    if (inputFile == null) {
        throw constructTermination("messageJobFileNoInputFile", null, null, jobFile.getAbsolutePath());
    }
    if (outputFile == null) {
        throw constructTermination("messageJobFileNoOutputFile", null, null, jobFile.getAbsolutePath());
    }

    String xmlBody;
    try {
        // JSON text is UTF-8; do not depend on the platform default charset.
        BufferedReader jsonReader = new BufferedReader(
                new java.io.InputStreamReader(new FileInputStream(inputFile), "UTF-8"));
        try {
            xmlBody = XML.toString(new JSONObject(new JSONTokener(jsonReader)));
        } finally {
            try {
                jsonReader.close();
            } catch (IOException ignored) {
                // Read-only reader: a failed close loses nothing.
            }
        }
    } catch (Exception ex) {
        throw constructTermination("messageConversionError", ex, null, inputFile.getAbsolutePath());
    }

    try {
        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));
        try {
            writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
            writer.write(
                    "<!-- This file was created by json_to_xml_1, which is free software licensed under the GNU Affero General Public License 3 or any later version (see https://github.com/publishing-systems/digital_publishing_workflow_tools/ and http://www.publishing-systems.org). -->\n");
            writer.write(xmlBody);
            writer.flush();
            // Close on the success path so that a failing close is reported as a write error.
            writer.close();
        } finally {
            try {
                // No-op after the successful close above; releases the file handle if a write failed.
                writer.close();
            } catch (IOException ignored) {
                // The primary failure, if any, is already propagating.
            }
        }
    } catch (FileNotFoundException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    } catch (UnsupportedEncodingException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    }

    return 0;
}
From source file:sapience.injectors.stax.inject.StringBasedStaxStreamInjector.java
/** * If the reference is more then a simple attribute, we have to add new XML (subtree) to the stream. We transform * the reference into an InputStream and invoke another SAX parsing process for it. But the parsed events are added * to the main XMLEventWriter. /*w w w . ja v a 2 s. c o m*/ * * @param w * @param string * @throws XMLStreamException * @throws XMLStreamException */ private void createEventsForElement(XMLEventWriter w, Reference ref) throws XMLStreamException { XMLEventReader r = null; try { StringBuilder target = new StringBuilder(ref.getTarget().toString()); NamespaceContext c = w.getNamespaceContext(); // process namespaces //processNamespace(target, w.getNamespaceContext()); ByteArrayInputStream bais = new ByteArrayInputStream(target.toString().getBytes()); this.inFac.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); r = this.inFac.createXMLEventReader(bais); // start a new line while (r.hasNext()) { XMLEvent e = r.nextEvent(); switch (e.getEventType()) { case XMLEvent.START_DOCUMENT: break; case XMLEvent.END_DOCUMENT: break; default: w.add(e); break; } } } finally { ; if (r != null) r.close(); } }
From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { try {/* w w w . j av a2s.c om*/ // wir iterieren schn ber die page-Eintrge. Darin gibt es dann title, timestamp, <contributor> => <username> und text. den text mssen // wir noch bereinigen. dazu nehmen wir eine Vorverarbeitung mit bliki - dazu mssen wir aber selbst nochmal den String vorbereiten und // nachbereinigen. Leider. WikipediaDumpParserConfig wikipediaDumpParserConfig = context.get(WikipediaDumpParserConfig.class); if (wikipediaDumpParserConfig == null) { Logger.getLogger(WikipediaDumpParser.class.getName()) .info("No wikipedia parser config found. Will take the default one."); wikipediaDumpParserConfig = new WikipediaDumpParserConfig(); } TikaInputStream tikaStream = TikaInputStream.get(stream); File fWikipediaDumpFile4Stream = tikaStream.getFile(); MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueHashMap<String, String>(); if (wikipediaDumpParserConfig.determinePageRedirects) hsPageTitle2Redirects = getPageTitle2Redirects(new FileInputStream(fWikipediaDumpFile4Stream)); HashSet<String> hsRedirectPageTitles = new HashSet<String>(hsPageTitle2Redirects.values()); String strCleanedText = ""; String strBaseURL = null; XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory .createXMLEventReader(new FileInputStream(fWikipediaDumpFile4Stream), "Utf-8"); while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("page")) { if (metadata.size() == 0) continue; // den mimetype wollen wir auch noch in den Metadaten haben metadata.add(Metadata.CONTENT_TYPE, "application/wikipedia+xml"); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); 
xhtml.characters(strCleanedText.toCharArray(), 0, strCleanedText.length()); xhtml.endElement("p"); xhtml.endDocument(); } if (!xmlEvent.isStartElement()) continue; // ##### die siteinfo if (strBaseURL == null && xmlEvent.asStartElement().getName().getLocalPart().equals("base")) { // http://de.wikipedia.org/wiki/Wikipedia:Hauptseite =>http://de.wikipedia.org/wiki/ strBaseURL = readNextCharEventsText(xmlEventReader); strBaseURL = strBaseURL.substring(0, strBaseURL.lastIndexOf("/") + 1); } // ##### die page if (xmlEvent.asStartElement().getName().getLocalPart().equals("page")) { for (String strKey : metadata.names()) metadata.remove(strKey); } // ##### der Title if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) { // wir merken uns immer den aktuellen Titel String strCurrentTitle = readNextCharEventsText(xmlEventReader); if (strCurrentTitle.equalsIgnoreCase("DuckDuckGo")) { int fasd = 8; } if (strCurrentTitle.toLowerCase().contains("duck") && strCurrentTitle.toLowerCase().contains("go")) { int is = 666; } // wenn der Titel eine redirect-Page ist, dann tragen wir die ganze Page aus der EventQueue aus, springen an das endPage, und // haben somit diese Seite ignoriert. 
Ferner ignorieren wir auch spezielle wikipedia-Seiten String strSmallTitle = strCurrentTitle.trim().toLowerCase(); if (hsRedirectPageTitles.contains(strCurrentTitle) || hsRedirectPageTitles.contains(strSmallTitle) || hsRedirectPageTitles.contains(strCurrentTitle.trim()) || strSmallTitle.startsWith("category:") || strSmallTitle.startsWith("kategorie:") || strSmallTitle.startsWith("vorlage:") || strSmallTitle.startsWith("template:") || strSmallTitle.startsWith("hilfe:") || strSmallTitle.startsWith("help:") || strSmallTitle.startsWith("wikipedia:") || strSmallTitle.startsWith("portal:") || strSmallTitle.startsWith("mediawiki:")) { while (true) { XMLEvent nextXmlEvent = xmlEventReader.nextEvent(); if (nextXmlEvent.isEndElement() && nextXmlEvent.asEndElement().getName().getLocalPart().equals("page")) break; } } else { metadata.add(Metadata.TITLE, strCurrentTitle); metadata.add(Metadata.SOURCE, strBaseURL + strCurrentTitle); for (String strRedirect : hsPageTitle2Redirects.get(strCurrentTitle)) { // wir ignorieren Titel, die sich lediglich durch gro/kleinschreibung unterscheiden if (!StringUtils.containsIgnoreCase(strRedirect, metadata.getValues(Metadata.TITLE))) metadata.add(Metadata.TITLE, strRedirect); } } continue; } // ##### der text if (xmlEvent.asStartElement().getName().getLocalPart().equals("text")) { String strText = readNextCharEventsText(xmlEventReader); if (wikipediaDumpParserConfig.parseLinksAndCategories) parseLinksAndCategories(strText, strBaseURL, metadata, handler); if (wikipediaDumpParserConfig.parseInfoBoxes) parseInfoBox(strText, metadata, handler); if (wikipediaDumpParserConfig.parseGeoCoordinates) parseGeoCoordinates(strText, metadata); // aufgrund einiger Defizite in dem verwendeten cleaner mssen wir hier leider noch zu-und nacharbeiten strText = strText.replaceAll("==\n", "==\n\n"); strText = strText.replaceAll("\n==", "\n\n=="); strCleanedText = m_wikiModel.render(new PlainTextConverter(), strText); strCleanedText = 
strCleanedText.replaceAll("\\{\\{", " "); strCleanedText = strCleanedText.replaceAll("\\}\\}", " "); strCleanedText = StringEscapeUtils.unescapeHtml4(strCleanedText); continue; } // ##### der timestamp if (xmlEvent.asStartElement().getName().getLocalPart().equals("timestamp")) { String strTimestamp = readNextCharEventsText(xmlEventReader); metadata.add(Metadata.MODIFIED, strTimestamp); continue; } // ##### der username if (xmlEvent.asStartElement().getName().getLocalPart().equals("username")) { String strUsername = readNextCharEventsText(xmlEventReader); metadata.add(Metadata.CREATOR, strUsername); continue; } } } catch (Exception e) { Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e); } }
From source file:ca.phon.session.io.xml.v12.XMLSessionReader_v12.java
@Override public boolean canRead(File file) throws IOException { // open file and make sure the first // element is 'session' with the correct version boolean canRead = false; // use StAX to read only first element // create StAX reader XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader reader = null; try (FileInputStream source = new FileInputStream(file)) { //BufferedReader in = new BufferedReader(new InputStreamReader(source, "UTF-8")); XMLEventReader xmlReader = factory.createXMLEventReader(source, "UTF-8"); reader = factory.createFilteredReader(xmlReader, new XMLWhitespaceFilter()); XMLEvent evt;//from www . ja v a 2 s. c o m while (!(evt = reader.nextEvent()).isStartElement()) ; canRead = evt.asStartElement().getName().getLocalPart().equals("session") && evt.asStartElement().getAttributeByName(new QName("version")).getValue().equals("PB1.2"); } catch (XMLStreamException e) { throw new IOException(e); } return canRead; }
From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java
private int countFeedElements(final InputStream is, final String elementName) throws XMLStreamException { final XMLEventReader reader = getEventReader(is); int count = 0; while (reader.hasNext()) { final XMLEvent event = reader.nextEvent(); if (event.getEventType() == XMLStreamConstants.START_ELEMENT && elementName.equals(event.asStartElement().getName().getLocalPart())) { count++;//from www .j av a2 s .co m } } reader.close(); return count; }
From source file:edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java
@Override public Communication fromCharacterBasedFile(final Path path) throws IngestException { if (!Files.exists(path)) throw new IngestException("No file at: " + path.toString()); AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory(); AnalyticUUIDGenerator g = f.create(); Communication c = new Communication(); c.setUuid(g.next());//from w w w . ja v a 2 s .co m c.setType(this.getKind()); c.setMetadata(TooledMetadataConverter.convert(this)); try { ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path); c.setId(ef.getName().split("\\.")[0]); } catch (NoSuchFileException | NotFileException e) { // might throw if path is a directory. throw new IngestException(path.toString() + " is not a file, or is a directory."); } String content; try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) { content = IOUtils.toString(bin, StandardCharsets.UTF_8); c.setText(content); } catch (IOException e) { throw new IngestException(e); } try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8); BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) { XMLEventReader rdr = null; try { rdr = inF.createXMLEventReader(reader); // Below method moves the reader // to the headline end element. Section headline = this.handleBeginning(rdr, content, c); headline.setUuid(g.next()); c.addToSectionList(headline); TextSpan sts = headline.getTextSpan(); LOGGER.debug("headline text: {}", c.getText().substring(sts.getStart(), sts.getEnding())); int sectNumber = 1; int subSect = 0; int currOff = -1; // Big amounts of characters. while (rdr.hasNext()) { XMLEvent nextEvent = rdr.nextEvent(); currOff = nextEvent.getLocation().getCharacterOffset(); // First: see if document is going to end. // If yes: exit. if (nextEvent.isEndDocument()) break; // region // enables ingestion of quotes inside a usenet webpost. 
// by Tongfei Chen if (nextEvent.isStartElement() && nextEvent.asStartElement().getName().equals(QName.valueOf("QUOTE"))) { Attribute attrQuote = nextEvent.asStartElement() .getAttributeByName(QName.valueOf("PREVIOUSPOST")); String quote = StringEscapeUtils.escapeXml(attrQuote.getValue()); int location = attrQuote.getLocation().getCharacterOffset() + "<QUOTE PREVIOUSPOST=\"".length(); Section quoteSection = new Section(g.next(), "quote") .setTextSpan(new TextSpan(location, location + quote.length())); c.addToSectionList(quoteSection); } // endregion // Check if start element. if (nextEvent.isCharacters()) { Characters chars = nextEvent.asCharacters(); if (!chars.isWhiteSpace()) { String fpContent = chars.getData(); LOGGER.debug("Character offset: {}", currOff); LOGGER.debug("Character based data: {}", fpContent); SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent); final int tsb = currOff + pads.getKey(); final int tse = currOff + fpContent.replace("\"", """).replace("<", "<") .replace(">", ">").length() - (pads.getValue()); // MAINTAIN CORRECT TEXT SPAN // CANNOT USE StringEscapeUtils.escapeXml because it will escape "'", which // is not escaped in the data // @tongfei LOGGER.debug("Section text: {}", content.substring(tsb, tse)); TextSpan ts = new TextSpan(tsb, tse); String sk; if (subSect == 0) sk = "poster"; else if (subSect == 1) sk = "postdate"; else sk = "post"; Section s = new Section(); s.setKind(sk); s.setTextSpan(ts); s.setUuid(g.next()); List<Integer> intList = new ArrayList<>(); intList.add(sectNumber); intList.add(subSect); s.setNumberList(intList); c.addToSectionList(s); subSect++; } } else if (nextEvent.isEndElement()) { EndElement ee = nextEvent.asEndElement(); currOff = ee.getLocation().getCharacterOffset(); QName name = ee.getName(); String localName = name.getLocalPart(); LOGGER.debug("Hit end element: {}", localName); if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) { LOGGER.debug("Switching to new post."); sectNumber++; 
subSect = 0; } else if (localName.equalsIgnoreCase(TEXT_LOCAL_NAME)) { // done with document. break; } } } return c; } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException | ClassCastException x) { throw new IngestException(x); } finally { if (rdr != null) try { rdr.close(); } catch (XMLStreamException e) { // not likely. LOGGER.info("Error closing XMLReader.", e); } } } catch (IOException e) { throw new IngestException(e); } }
From source file:com.evolveum.polygon.connector.hcm.DocumentProcessing.java
public Map<String, Object> parseXMLData(HcmConnectorConfiguration conf, ResultsHandler handler, Map<String, Object> schemaAttributeMap, Filter query) { XMLInputFactory factory = XMLInputFactory.newInstance(); try {/*from w w w . j a va 2s . c om*/ String uidAttributeName = conf.getUidAttribute(); String primariId = conf.getPrimaryId(); String startName = ""; String value = null; StringBuilder assignmentXMLBuilder = null; List<String> builderList = new ArrayList<String>(); Integer nOfIterations = 0; Boolean isSubjectToQuery = false; Boolean isAssigment = false; Boolean evaluateAttr = true; Boolean specificAttributeQuery = false; XMLEventReader eventReader = factory.createXMLEventReader(new FileReader(conf.getFilePath())); List<String> dictionary = populateDictionary(FIRSTFLAG); if (!attrsToGet.isEmpty()) { attrsToGet.add(uidAttributeName); attrsToGet.add(primariId); specificAttributeQuery = true; evaluateAttr = false; LOGGER.ok("The uid and primary id were added to the queried attribute list"); schemaAttributeMap = modifySchemaAttributeMap(schemaAttributeMap); } while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); Integer code = event.getEventType(); if (code == XMLStreamConstants.START_ELEMENT) { StartElement startElement = event.asStartElement(); startName = startElement.getName().getLocalPart(); if (!evaluateAttr && attrsToGet.contains(startName)) { evaluateAttr = true; } if (!elementIsEmployeeData) { if (startName.equals(EMPLOYEES)) { if (dictionary.contains(nOfIterations.toString())) { LOGGER.ok("The defined number of iterations has been hit: {0}", nOfIterations.toString()); break; } else { startName = ""; elementIsEmployeeData = true; nOfIterations++; } } } else if (evaluateAttr) { if (!isAssigment) { if (!ASSIGNMENTTAG.equals(startName)) { } else { assignmentXMLBuilder = new StringBuilder(); isAssigment = true; } } else { builderList = processAssignment(startName, null, START, builderList); } if 
(multiValuedAttributesList.contains(startName)) { elementIsMultiValued = true; } } } else if (elementIsEmployeeData) { if (code == XMLStreamConstants.CHARACTERS && evaluateAttr) { Characters characters = event.asCharacters(); if (!characters.isWhiteSpace()) { StringBuilder valueBuilder; if (value != null) { valueBuilder = new StringBuilder(value).append("") .append(characters.getData().toString()); } else { valueBuilder = new StringBuilder(characters.getData().toString()); } value = valueBuilder.toString(); // value = StringEscapeUtils.escapeXml10(value); // LOGGER.info("The attribute value for: {0} is // {1}", startName, value); } } else if (code == XMLStreamConstants.END_ELEMENT) { EndElement endElement = event.asEndElement(); String endName = endElement.getName().getLocalPart(); isSubjectToQuery = checkFilter(endName, value, query, uidAttributeName); if (!isSubjectToQuery) { attributeMap.clear(); elementIsEmployeeData = false; value = null; endName = EMPLOYEES; } if (endName.equals(EMPLOYEES)) { attributeMap = handleEmployeeData(attributeMap, schemaAttributeMap, handler, uidAttributeName, primariId); elementIsEmployeeData = false; } else if (evaluateAttr) { if (endName.equals(startName)) { if (value != null) { if (!isAssigment) { if (!elementIsMultiValued) { attributeMap.put(startName, value); } else { multiValuedAttributeBuffer.put(startName, value); } } else { value = StringEscapeUtils.escapeXml10(value); builderList = processAssignment(endName, value, VALUE, builderList); builderList = processAssignment(endName, null, END, builderList); } // LOGGER.info("Attribute name: {0} and the // Attribute value: {1}", endName, value); value = null; } } else { if (endName.equals(ASSIGNMENTTAG)) { builderList = processAssignment(endName, null, CLOSE, builderList); // if (assigmentIsActive) { for (String records : builderList) { assignmentXMLBuilder.append(records); } attributeMap.put(ASSIGNMENTTAG, assignmentXMLBuilder.toString()); // } else { // } builderList = new 
ArrayList<String>(); // assigmentIsActive = false; isAssigment = false; } else if (multiValuedAttributesList.contains(endName)) { processMultiValuedAttributes(multiValuedAttributeBuffer); } } } if (specificAttributeQuery && evaluateAttr) { evaluateAttr = false; } } } else if (code == XMLStreamConstants.END_DOCUMENT) { handleBufferedData(uidAttributeName, primariId, handler); } } } catch (FileNotFoundException e) { StringBuilder errorBuilder = new StringBuilder("File not found at the specified path.") .append(e.getLocalizedMessage()); LOGGER.error("File not found at the specified path: {0}", e); throw new ConnectorIOException(errorBuilder.toString()); } catch (XMLStreamException e) { LOGGER.error("Unexpected processing error while parsing the .xml document : {0}", e); StringBuilder errorBuilder = new StringBuilder( "Unexpected processing error while parsing the .xml document. ") .append(e.getLocalizedMessage()); throw new ConnectorIOException(errorBuilder.toString()); } return attributeMap; }
From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java
/**
 * Copies the events of an XML document into the given writer, framed by a line break before and after.
 * Document start, document end and comment events are left out. The reader and the content stream are
 * closed afterwards, whether copying succeeded or not.
 *
 * @param content the XML document to copy
 * @param writer the writer receiving the events
 * @throws Exception if reading or writing an event fails
 */
private void addAtomElement(final InputStream content, final XMLEventWriter writer) throws Exception {
    final XMLEventReader atomEvents = getEventReader(content);
    final XMLEvent lineBreak = XMLEventFactory.newInstance().createSpace("\n");

    try {
        writer.add(lineBreak);

        while (atomEvents.hasNext()) {
            final XMLEvent current = atomEvents.nextEvent();
            switch (current.getEventType()) {
            case XMLStreamConstants.START_DOCUMENT:
            case XMLStreamConstants.END_DOCUMENT:
            case XMLStreamConstants.COMMENT:
                // Not part of the element being embedded.
                break;
            default:
                writer.add(current);
                break;
            }
        }

        writer.add(lineBreak);
    } finally {
        atomEvents.close();
        IOUtils.closeQuietly(content);
    }
}
From source file:com.logiware.accounting.domain.EdiInvoice.java
private void createEcuLineInvoice(File file) throws Exception { InputStream inputStream = null; XMLEventReader eventReader = null; try {/*from w ww . ja v a2 s.com*/ XMLInputFactory inputFactory = XMLInputFactory.newInstance(); inputStream = new FileInputStream(file); eventReader = inputFactory.createXMLEventReader(inputStream); while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); if (event.isStartElement()) { StartElement startElement = event.asStartElement(); if ("Header".equalsIgnoreCase(startElement.getName().toString())) { isHeader = true; elements.add("Header"); } else if ("Body".equalsIgnoreCase(startElement.getName().toString())) { isBody = true; elements.add("Body"); } else if (isBody && "Information".equalsIgnoreCase(startElement.getName().toString())) { isInformation = true; elements.add("Information"); } else if (isBody && !isInformation && "Details".equalsIgnoreCase(startElement.getName().toString())) { isDetails = true; elements.add("Details"); } else if (isBody && !isInformation && !isDetails && "Summary".equalsIgnoreCase(startElement.getName().toString())) { isSummary = true; elements.add("Summary"); } else if (null == elementType) { setElementType(startElement); } else if (null != elementType && null == characterType) { setCharacterType(startElement); } } else if (event.isCharacters()) { setValue(event.asCharacters()); } else if (event.isEndElement()) { EndElement endElement = event.asEndElement(); if (null != characterType && null != elementType) { removeCharacterType(); } else if (null != elementType) { removeElementType(endElement); } else if (isSummary && "Summary".equalsIgnoreCase(endElement.getName().toString())) { isSummary = false; } else if (isDetails && "Details".equalsIgnoreCase(endElement.getName().toString())) { isDetails = false; } else if (isBody && "Information".equalsIgnoreCase(endElement.getName().toString())) { isInformation = false; } else if ("Body".equalsIgnoreCase(endElement.getName().toString())) { 
isBody = false; } else if ("Header".equalsIgnoreCase(endElement.getName().toString())) { isHeader = false; } } } this.company = Company.ECU_LINE; status = new EdiInvoiceDAO().getStatus(vendorNumber, invoiceNumber); if (!elements.contains("Header")) { throw new AccountingException("Bad File. <Header> element missing"); } else if (!elements.contains("Body")) { throw new AccountingException("Bad File. <Body> missing"); } else if (!elements.contains("Information")) { throw new AccountingException("Bad File. <Information> element under <Body> missing"); } else if (!elements.contains("Details")) { throw new AccountingException("Bad File. <Details> element under <Body> missing"); } else if (!elements.contains("Summary")) { throw new AccountingException("Bad File. <Summary> element under <Body> missing"); } else if (!elements.contains("Applicationreference")) { throw new AccountingException("Bad File. <Applicationreference> element under <Header> missing"); } else if (!elements.contains("Reference")) { throw new AccountingException("Bad File. <Reference> element under <Header> missing"); } else if (!elements.contains("Sender")) { throw new AccountingException("Bad File. <Sender> element under <Header> missing"); } else if (!elements.contains("Code")) { throw new AccountingException("Bad File. <Code> element under <Sender> of <Header> missing"); } else if (!elements.contains("Invoice")) { throw new AccountingException( "Bad File. <Invoice> element under <Information> element of <Body> missing"); } else if (!elements.contains("RelatedReferences")) { throw new AccountingException( "Bad File. <RelatedReferences> element under <Information> element of <Body> missing"); } else if (!elements.contains("BY")) { throw new AccountingException( "Bad File. <Parties Qualifier=\"BY\"> under <Information> element of <Body> missing"); } else if (!elements.contains("SU")) { throw new AccountingException( "Bad File. 
<Parties Qualifier=\"SU\"> under <Information> element of <Body> missing"); } else if (!elements.contains("PaymentTerms")) { throw new AccountingException( "Bad File. <PaymentTerms> element under <Information> element of <Body> missing"); } else if (!elements.contains("ShipmentInformation")) { throw new AccountingException( "Bad File. <ShipmentInformation> element under <Information> element of <Body> missing"); } else if (!elements.contains("Detail")) { throw new AccountingException( "Bad File. <Detail> element under <Details> element of <Body> missing"); } else if (!elements.contains("TotalMonetaryAmount")) { throw new AccountingException( "Bad File. <TotalMonetaryAmount> element under <Summary> element of <Body> missing"); } else if (!elements.contains("TotalMonetaryAmountGroupByVAT")) { throw new AccountingException( "Bad File. <TotalMonetaryAmountGroupByVAT> element under <Summary> element of <Body> missing"); } } catch (Exception e) { throw e; } finally { if (null != eventReader) { eventReader.close(); } if (null != inputStream) { inputStream.close(); } } }