List of usage examples for javax.xml.stream XMLEventReader hasNext
@Override public boolean hasNext();
From source file:sapience.injectors.stax.inject.StringBasedStaxStreamInjector.java
/** * If the reference is more then a simple attribute, we have to add new XML (subtree) to the stream. We transform * the reference into an InputStream and invoke another SAX parsing process for it. But the parsed events are added * to the main XMLEventWriter. //from w w w . jav a 2s .c o m * * @param w * @param string * @throws XMLStreamException * @throws XMLStreamException */ private void createEventsForElement(XMLEventWriter w, Reference ref) throws XMLStreamException { XMLEventReader r = null; try { StringBuilder target = new StringBuilder(ref.getTarget().toString()); NamespaceContext c = w.getNamespaceContext(); // process namespaces //processNamespace(target, w.getNamespaceContext()); ByteArrayInputStream bais = new ByteArrayInputStream(target.toString().getBytes()); this.inFac.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); r = this.inFac.createXMLEventReader(bais); // start a new line while (r.hasNext()) { XMLEvent e = r.nextEvent(); switch (e.getEventType()) { case XMLEvent.START_DOCUMENT: break; case XMLEvent.END_DOCUMENT: break; default: w.add(e); break; } } } finally { ; if (r != null) r.close(); } }
From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { try {/*w w w . j a v a2 s.co m*/ // wir iterieren schn ber die page-Eintrge. Darin gibt es dann title, timestamp, <contributor> => <username> und text. den text mssen // wir noch bereinigen. dazu nehmen wir eine Vorverarbeitung mit bliki - dazu mssen wir aber selbst nochmal den String vorbereiten und // nachbereinigen. Leider. WikipediaDumpParserConfig wikipediaDumpParserConfig = context.get(WikipediaDumpParserConfig.class); if (wikipediaDumpParserConfig == null) { Logger.getLogger(WikipediaDumpParser.class.getName()) .info("No wikipedia parser config found. Will take the default one."); wikipediaDumpParserConfig = new WikipediaDumpParserConfig(); } TikaInputStream tikaStream = TikaInputStream.get(stream); File fWikipediaDumpFile4Stream = tikaStream.getFile(); MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueHashMap<String, String>(); if (wikipediaDumpParserConfig.determinePageRedirects) hsPageTitle2Redirects = getPageTitle2Redirects(new FileInputStream(fWikipediaDumpFile4Stream)); HashSet<String> hsRedirectPageTitles = new HashSet<String>(hsPageTitle2Redirects.values()); String strCleanedText = ""; String strBaseURL = null; XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory .createXMLEventReader(new FileInputStream(fWikipediaDumpFile4Stream), "Utf-8"); while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("page")) { if (metadata.size() == 0) continue; // den mimetype wollen wir auch noch in den Metadaten haben metadata.add(Metadata.CONTENT_TYPE, "application/wikipedia+xml"); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); xhtml.characters(strCleanedText.toCharArray(), 0, strCleanedText.length()); xhtml.endElement("p"); xhtml.endDocument(); } if (!xmlEvent.isStartElement()) continue; // ##### die siteinfo if (strBaseURL == null && xmlEvent.asStartElement().getName().getLocalPart().equals("base")) { // http://de.wikipedia.org/wiki/Wikipedia:Hauptseite =>http://de.wikipedia.org/wiki/ strBaseURL = readNextCharEventsText(xmlEventReader); strBaseURL = strBaseURL.substring(0, strBaseURL.lastIndexOf("/") + 1); } // ##### die page if (xmlEvent.asStartElement().getName().getLocalPart().equals("page")) { for (String strKey : metadata.names()) metadata.remove(strKey); } // ##### der Title if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) { // wir merken uns immer den aktuellen Titel String strCurrentTitle = readNextCharEventsText(xmlEventReader); if (strCurrentTitle.equalsIgnoreCase("DuckDuckGo")) { int fasd = 8; } if (strCurrentTitle.toLowerCase().contains("duck") && strCurrentTitle.toLowerCase().contains("go")) { int is = 666; } // wenn der Titel eine redirect-Page ist, dann tragen wir die ganze Page aus der EventQueue aus, springen an das endPage, und // haben somit diese Seite ignoriert. Ferner ignorieren wir auch spezielle wikipedia-Seiten String strSmallTitle = strCurrentTitle.trim().toLowerCase(); if (hsRedirectPageTitles.contains(strCurrentTitle) || hsRedirectPageTitles.contains(strSmallTitle) || hsRedirectPageTitles.contains(strCurrentTitle.trim()) || strSmallTitle.startsWith("category:") || strSmallTitle.startsWith("kategorie:") || strSmallTitle.startsWith("vorlage:") || strSmallTitle.startsWith("template:") || strSmallTitle.startsWith("hilfe:") || strSmallTitle.startsWith("help:") || strSmallTitle.startsWith("wikipedia:") || strSmallTitle.startsWith("portal:") || strSmallTitle.startsWith("mediawiki:")) { while (true) { XMLEvent nextXmlEvent = xmlEventReader.nextEvent(); if (nextXmlEvent.isEndElement() && nextXmlEvent.asEndElement().getName().getLocalPart().equals("page")) break; } } else { metadata.add(Metadata.TITLE, strCurrentTitle); metadata.add(Metadata.SOURCE, strBaseURL + strCurrentTitle); for (String strRedirect : hsPageTitle2Redirects.get(strCurrentTitle)) { // wir ignorieren Titel, die sich lediglich durch gro/kleinschreibung unterscheiden if (!StringUtils.containsIgnoreCase(strRedirect, metadata.getValues(Metadata.TITLE))) metadata.add(Metadata.TITLE, strRedirect); } } continue; } // ##### der text if (xmlEvent.asStartElement().getName().getLocalPart().equals("text")) { String strText = readNextCharEventsText(xmlEventReader); if (wikipediaDumpParserConfig.parseLinksAndCategories) parseLinksAndCategories(strText, strBaseURL, metadata, handler); if (wikipediaDumpParserConfig.parseInfoBoxes) parseInfoBox(strText, metadata, handler); if (wikipediaDumpParserConfig.parseGeoCoordinates) parseGeoCoordinates(strText, metadata); // aufgrund einiger Defizite in dem verwendeten cleaner mssen wir hier leider noch zu-und nacharbeiten strText = strText.replaceAll("==\n", "==\n\n"); strText = strText.replaceAll("\n==", "\n\n=="); strCleanedText = m_wikiModel.render(new PlainTextConverter(), strText); strCleanedText = strCleanedText.replaceAll("\\{\\{", " "); strCleanedText = strCleanedText.replaceAll("\\}\\}", " "); strCleanedText = StringEscapeUtils.unescapeHtml4(strCleanedText); continue; } // ##### der timestamp if (xmlEvent.asStartElement().getName().getLocalPart().equals("timestamp")) { String strTimestamp = readNextCharEventsText(xmlEventReader); metadata.add(Metadata.MODIFIED, strTimestamp); continue; } // ##### der username if (xmlEvent.asStartElement().getName().getLocalPart().equals("username")) { String strUsername = readNextCharEventsText(xmlEventReader); metadata.add(Metadata.CREATOR, strUsername); continue; } } } catch (Exception e) { Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e); } }
From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java
private int countFeedElements(final InputStream is, final String elementName) throws XMLStreamException { final XMLEventReader reader = getEventReader(is); int count = 0; while (reader.hasNext()) { final XMLEvent event = reader.nextEvent(); if (event.getEventType() == XMLStreamConstants.START_ELEMENT && elementName.equals(event.asStartElement().getName().getLocalPart())) { count++;/*from ww w. ja v a2 s . co m*/ } } reader.close(); return count; }
From source file:edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java
@Override public Communication fromCharacterBasedFile(final Path path) throws IngestException { if (!Files.exists(path)) throw new IngestException("No file at: " + path.toString()); AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory(); AnalyticUUIDGenerator g = f.create(); Communication c = new Communication(); c.setUuid(g.next());//from ww w. j ava 2 s. co m c.setType(this.getKind()); c.setMetadata(TooledMetadataConverter.convert(this)); try { ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path); c.setId(ef.getName().split("\\.")[0]); } catch (NoSuchFileException | NotFileException e) { // might throw if path is a directory. throw new IngestException(path.toString() + " is not a file, or is a directory."); } String content; try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) { content = IOUtils.toString(bin, StandardCharsets.UTF_8); c.setText(content); } catch (IOException e) { throw new IngestException(e); } try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8); BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) { XMLEventReader rdr = null; try { rdr = inF.createXMLEventReader(reader); // Below method moves the reader // to the headline end element. Section headline = this.handleBeginning(rdr, content, c); headline.setUuid(g.next()); c.addToSectionList(headline); TextSpan sts = headline.getTextSpan(); LOGGER.debug("headline text: {}", c.getText().substring(sts.getStart(), sts.getEnding())); int sectNumber = 1; int subSect = 0; int currOff = -1; // Big amounts of characters. while (rdr.hasNext()) { XMLEvent nextEvent = rdr.nextEvent(); currOff = nextEvent.getLocation().getCharacterOffset(); // First: see if document is going to end. // If yes: exit. if (nextEvent.isEndDocument()) break; // region // enables ingestion of quotes inside a usenet webpost. // by Tongfei Chen if (nextEvent.isStartElement() && nextEvent.asStartElement().getName().equals(QName.valueOf("QUOTE"))) { Attribute attrQuote = nextEvent.asStartElement() .getAttributeByName(QName.valueOf("PREVIOUSPOST")); String quote = StringEscapeUtils.escapeXml(attrQuote.getValue()); int location = attrQuote.getLocation().getCharacterOffset() + "<QUOTE PREVIOUSPOST=\"".length(); Section quoteSection = new Section(g.next(), "quote") .setTextSpan(new TextSpan(location, location + quote.length())); c.addToSectionList(quoteSection); } // endregion // Check if start element. if (nextEvent.isCharacters()) { Characters chars = nextEvent.asCharacters(); if (!chars.isWhiteSpace()) { String fpContent = chars.getData(); LOGGER.debug("Character offset: {}", currOff); LOGGER.debug("Character based data: {}", fpContent); SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent); final int tsb = currOff + pads.getKey(); final int tse = currOff + fpContent.replace("\"", """).replace("<", "<") .replace(">", ">").length() - (pads.getValue()); // MAINTAIN CORRECT TEXT SPAN // CANNOT USE StringEscapeUtils.escapeXml because it will escape "'", which // is not escaped in the data // @tongfei LOGGER.debug("Section text: {}", content.substring(tsb, tse)); TextSpan ts = new TextSpan(tsb, tse); String sk; if (subSect == 0) sk = "poster"; else if (subSect == 1) sk = "postdate"; else sk = "post"; Section s = new Section(); s.setKind(sk); s.setTextSpan(ts); s.setUuid(g.next()); List<Integer> intList = new ArrayList<>(); intList.add(sectNumber); intList.add(subSect); s.setNumberList(intList); c.addToSectionList(s); subSect++; } } else if (nextEvent.isEndElement()) { EndElement ee = nextEvent.asEndElement(); currOff = ee.getLocation().getCharacterOffset(); QName name = ee.getName(); String localName = name.getLocalPart(); LOGGER.debug("Hit end element: {}", localName); if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) { LOGGER.debug("Switching to new post."); sectNumber++; subSect = 0; } else if (localName.equalsIgnoreCase(TEXT_LOCAL_NAME)) { // done with document. break; } } } return c; } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException | ClassCastException x) { throw new IngestException(x); } finally { if (rdr != null) try { rdr.close(); } catch (XMLStreamException e) { // not likely. LOGGER.info("Error closing XMLReader.", e); } } } catch (IOException e) { throw new IngestException(e); } }
From source file:com.evolveum.polygon.connector.hcm.DocumentProcessing.java
public Map<String, Object> parseXMLData(HcmConnectorConfiguration conf, ResultsHandler handler, Map<String, Object> schemaAttributeMap, Filter query) { XMLInputFactory factory = XMLInputFactory.newInstance(); try {/* w w w . j av a 2s. c o m*/ String uidAttributeName = conf.getUidAttribute(); String primariId = conf.getPrimaryId(); String startName = ""; String value = null; StringBuilder assignmentXMLBuilder = null; List<String> builderList = new ArrayList<String>(); Integer nOfIterations = 0; Boolean isSubjectToQuery = false; Boolean isAssigment = false; Boolean evaluateAttr = true; Boolean specificAttributeQuery = false; XMLEventReader eventReader = factory.createXMLEventReader(new FileReader(conf.getFilePath())); List<String> dictionary = populateDictionary(FIRSTFLAG); if (!attrsToGet.isEmpty()) { attrsToGet.add(uidAttributeName); attrsToGet.add(primariId); specificAttributeQuery = true; evaluateAttr = false; LOGGER.ok("The uid and primary id were added to the queried attribute list"); schemaAttributeMap = modifySchemaAttributeMap(schemaAttributeMap); } while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); Integer code = event.getEventType(); if (code == XMLStreamConstants.START_ELEMENT) { StartElement startElement = event.asStartElement(); startName = startElement.getName().getLocalPart(); if (!evaluateAttr && attrsToGet.contains(startName)) { evaluateAttr = true; } if (!elementIsEmployeeData) { if (startName.equals(EMPLOYEES)) { if (dictionary.contains(nOfIterations.toString())) { LOGGER.ok("The defined number of iterations has been hit: {0}", nOfIterations.toString()); break; } else { startName = ""; elementIsEmployeeData = true; nOfIterations++; } } } else if (evaluateAttr) { if (!isAssigment) { if (!ASSIGNMENTTAG.equals(startName)) { } else { assignmentXMLBuilder = new StringBuilder(); isAssigment = true; } } else { builderList = processAssignment(startName, null, START, builderList); } if (multiValuedAttributesList.contains(startName)) { elementIsMultiValued = true; } } } else if (elementIsEmployeeData) { if (code == XMLStreamConstants.CHARACTERS && evaluateAttr) { Characters characters = event.asCharacters(); if (!characters.isWhiteSpace()) { StringBuilder valueBuilder; if (value != null) { valueBuilder = new StringBuilder(value).append("") .append(characters.getData().toString()); } else { valueBuilder = new StringBuilder(characters.getData().toString()); } value = valueBuilder.toString(); // value = StringEscapeUtils.escapeXml10(value); // LOGGER.info("The attribute value for: {0} is // {1}", startName, value); } } else if (code == XMLStreamConstants.END_ELEMENT) { EndElement endElement = event.asEndElement(); String endName = endElement.getName().getLocalPart(); isSubjectToQuery = checkFilter(endName, value, query, uidAttributeName); if (!isSubjectToQuery) { attributeMap.clear(); elementIsEmployeeData = false; value = null; endName = EMPLOYEES; } if (endName.equals(EMPLOYEES)) { attributeMap = handleEmployeeData(attributeMap, schemaAttributeMap, handler, uidAttributeName, primariId); elementIsEmployeeData = false; } else if (evaluateAttr) { if (endName.equals(startName)) { if (value != null) { if (!isAssigment) { if (!elementIsMultiValued) { attributeMap.put(startName, value); } else { multiValuedAttributeBuffer.put(startName, value); } } else { value = StringEscapeUtils.escapeXml10(value); builderList = processAssignment(endName, value, VALUE, builderList); builderList = processAssignment(endName, null, END, builderList); } // LOGGER.info("Attribute name: {0} and the // Attribute value: {1}", endName, value); value = null; } } else { if (endName.equals(ASSIGNMENTTAG)) { builderList = processAssignment(endName, null, CLOSE, builderList); // if (assigmentIsActive) { for (String records : builderList) { assignmentXMLBuilder.append(records); } attributeMap.put(ASSIGNMENTTAG, assignmentXMLBuilder.toString()); // } else { // } builderList = new ArrayList<String>(); // assigmentIsActive = false; isAssigment = false; } else if (multiValuedAttributesList.contains(endName)) { processMultiValuedAttributes(multiValuedAttributeBuffer); } } } if (specificAttributeQuery && evaluateAttr) { evaluateAttr = false; } } } else if (code == XMLStreamConstants.END_DOCUMENT) { handleBufferedData(uidAttributeName, primariId, handler); } } } catch (FileNotFoundException e) { StringBuilder errorBuilder = new StringBuilder("File not found at the specified path.") .append(e.getLocalizedMessage()); LOGGER.error("File not found at the specified path: {0}", e); throw new ConnectorIOException(errorBuilder.toString()); } catch (XMLStreamException e) { LOGGER.error("Unexpected processing error while parsing the .xml document : {0}", e); StringBuilder errorBuilder = new StringBuilder( "Unexpected processing error while parsing the .xml document. ") .append(e.getLocalizedMessage()); throw new ConnectorIOException(errorBuilder.toString()); } return attributeMap; }
From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java
private void addAtomElement(final InputStream content, final XMLEventWriter writer) throws Exception { final XMLEventReader reader = getEventReader(content); final XMLEventFactory eventFactory = XMLEventFactory.newInstance(); XMLEvent newLine = eventFactory.createSpace("\n"); try {/*from w w w . ja v a2 s .com*/ writer.add(newLine); while (reader.hasNext()) { final XMLEvent event = reader.nextEvent(); if (event.getEventType() != XMLStreamConstants.START_DOCUMENT && event.getEventType() != XMLStreamConstants.END_DOCUMENT && event.getEventType() != XMLStreamConstants.COMMENT) { writer.add(event); } } writer.add(newLine); } finally { reader.close(); IOUtils.closeQuietly(content); } }
From source file:com.logiware.accounting.domain.EdiInvoice.java
private void createEcuLineInvoice(File file) throws Exception { InputStream inputStream = null; XMLEventReader eventReader = null; try {//w ww . j a va2 s.c om XMLInputFactory inputFactory = XMLInputFactory.newInstance(); inputStream = new FileInputStream(file); eventReader = inputFactory.createXMLEventReader(inputStream); while (eventReader.hasNext()) { XMLEvent event = eventReader.nextEvent(); if (event.isStartElement()) { StartElement startElement = event.asStartElement(); if ("Header".equalsIgnoreCase(startElement.getName().toString())) { isHeader = true; elements.add("Header"); } else if ("Body".equalsIgnoreCase(startElement.getName().toString())) { isBody = true; elements.add("Body"); } else if (isBody && "Information".equalsIgnoreCase(startElement.getName().toString())) { isInformation = true; elements.add("Information"); } else if (isBody && !isInformation && "Details".equalsIgnoreCase(startElement.getName().toString())) { isDetails = true; elements.add("Details"); } else if (isBody && !isInformation && !isDetails && "Summary".equalsIgnoreCase(startElement.getName().toString())) { isSummary = true; elements.add("Summary"); } else if (null == elementType) { setElementType(startElement); } else if (null != elementType && null == characterType) { setCharacterType(startElement); } } else if (event.isCharacters()) { setValue(event.asCharacters()); } else if (event.isEndElement()) { EndElement endElement = event.asEndElement(); if (null != characterType && null != elementType) { removeCharacterType(); } else if (null != elementType) { removeElementType(endElement); } else if (isSummary && "Summary".equalsIgnoreCase(endElement.getName().toString())) { isSummary = false; } else if (isDetails && "Details".equalsIgnoreCase(endElement.getName().toString())) { isDetails = false; } else if (isBody && "Information".equalsIgnoreCase(endElement.getName().toString())) { isInformation = false; } else if ("Body".equalsIgnoreCase(endElement.getName().toString())) { isBody = false; } else if ("Header".equalsIgnoreCase(endElement.getName().toString())) { isHeader = false; } } } this.company = Company.ECU_LINE; status = new EdiInvoiceDAO().getStatus(vendorNumber, invoiceNumber); if (!elements.contains("Header")) { throw new AccountingException("Bad File. <Header> element missing"); } else if (!elements.contains("Body")) { throw new AccountingException("Bad File. <Body> missing"); } else if (!elements.contains("Information")) { throw new AccountingException("Bad File. <Information> element under <Body> missing"); } else if (!elements.contains("Details")) { throw new AccountingException("Bad File. <Details> element under <Body> missing"); } else if (!elements.contains("Summary")) { throw new AccountingException("Bad File. <Summary> element under <Body> missing"); } else if (!elements.contains("Applicationreference")) { throw new AccountingException("Bad File. <Applicationreference> element under <Header> missing"); } else if (!elements.contains("Reference")) { throw new AccountingException("Bad File. <Reference> element under <Header> missing"); } else if (!elements.contains("Sender")) { throw new AccountingException("Bad File. <Sender> element under <Header> missing"); } else if (!elements.contains("Code")) { throw new AccountingException("Bad File. <Code> element under <Sender> of <Header> missing"); } else if (!elements.contains("Invoice")) { throw new AccountingException( "Bad File. <Invoice> element under <Information> element of <Body> missing"); } else if (!elements.contains("RelatedReferences")) { throw new AccountingException( "Bad File. <RelatedReferences> element under <Information> element of <Body> missing"); } else if (!elements.contains("BY")) { throw new AccountingException( "Bad File. <Parties Qualifier=\"BY\"> under <Information> element of <Body> missing"); } else if (!elements.contains("SU")) { throw new AccountingException( "Bad File. <Parties Qualifier=\"SU\"> under <Information> element of <Body> missing"); } else if (!elements.contains("PaymentTerms")) { throw new AccountingException( "Bad File. <PaymentTerms> element under <Information> element of <Body> missing"); } else if (!elements.contains("ShipmentInformation")) { throw new AccountingException( "Bad File. <ShipmentInformation> element under <Information> element of <Body> missing"); } else if (!elements.contains("Detail")) { throw new AccountingException( "Bad File. <Detail> element under <Details> element of <Body> missing"); } else if (!elements.contains("TotalMonetaryAmount")) { throw new AccountingException( "Bad File. <TotalMonetaryAmount> element under <Summary> element of <Body> missing"); } else if (!elements.contains("TotalMonetaryAmountGroupByVAT")) { throw new AccountingException( "Bad File. <TotalMonetaryAmountGroupByVAT> element under <Summary> element of <Body> missing"); } } catch (Exception e) { throw e; } finally { if (null != eventReader) { eventReader.close(); } if (null != inputStream) { inputStream.close(); } } }
From source file:ca.uhn.fhir.parser.XmlParser.java
private <T> T doXmlLoop(XMLEventReader streamReader, ParserState<T> parserState) { ourLog.trace("Entering XML parsing loop with state: {}", parserState); try {/*from ww w . j av a 2s . co m*/ List<String> heldComments = new ArrayList<String>(1); while (streamReader.hasNext()) { XMLEvent nextEvent = streamReader.nextEvent(); try { switch (nextEvent.getEventType()) { case XMLStreamConstants.START_ELEMENT: { StartElement elem = nextEvent.asStartElement(); String namespaceURI = elem.getName().getNamespaceURI(); if ("extension".equals(elem.getName().getLocalPart())) { Attribute urlAttr = elem.getAttributeByName(new QName("url")); String url; if (urlAttr == null || isBlank(urlAttr.getValue())) { getErrorHandler().missingRequiredElement(new ParseLocation("extension"), "url"); url = null; } else { url = urlAttr.getValue(); } parserState.enteringNewElementExtension(elem, url, false); } else if ("modifierExtension".equals(elem.getName().getLocalPart())) { Attribute urlAttr = elem.getAttributeByName(new QName("url")); String url; if (urlAttr == null || isBlank(urlAttr.getValue())) { getErrorHandler().missingRequiredElement(new ParseLocation("modifierExtension"), "url"); url = null; } else { url = urlAttr.getValue(); } parserState.enteringNewElementExtension(elem, url, true); } else { String elementName = elem.getName().getLocalPart(); parserState.enteringNewElement(namespaceURI, elementName); } if (!heldComments.isEmpty()) { for (String next : heldComments) { parserState.commentPre(next); } heldComments.clear(); } @SuppressWarnings("unchecked") Iterator<Attribute> attributes = elem.getAttributes(); for (Iterator<Attribute> iter = attributes; iter.hasNext();) { Attribute next = iter.next(); parserState.attributeValue(next.getName().getLocalPart(), next.getValue()); } break; } case XMLStreamConstants.END_DOCUMENT: case XMLStreamConstants.END_ELEMENT: { if (!heldComments.isEmpty()) { for (String next : heldComments) { parserState.commentPost(next); } heldComments.clear(); } parserState.endingElement(); // if (parserState.isComplete()) { // return parserState.getObject(); // } break; } case XMLStreamConstants.CHARACTERS: { parserState.string(nextEvent.asCharacters().getData()); break; } case XMLStreamConstants.COMMENT: { Comment comment = (Comment) nextEvent; String commentText = comment.getText(); heldComments.add(commentText); break; } } parserState.xmlEvent(nextEvent); } catch (DataFormatException e) { throw new DataFormatException("DataFormatException at [" + nextEvent.getLocation().toString() + "]: " + e.getMessage(), e); } } return parserState.getObject(); } catch (XMLStreamException e) { throw new DataFormatException(e); } }
From source file:edu.unc.lib.dl.util.TripleStoreQueryServiceMulgaraImpl.java
/** * @param query/* w w w .j ava2 s . c o m*/ * an ITQL command * @return the message returned by Mulgara * @throws RemoteException * for communication failure */ public String storeCommand(String query) { String result = null; String response = this.sendTQL(query); if (response != null) { XMLInputFactory factory = XMLInputFactory.newInstance(); factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); try (StringReader sr = new StringReader(response)) { XMLEventReader r = factory.createXMLEventReader(sr); boolean inMessage = false; StringBuffer message = new StringBuffer(); while (r.hasNext()) { XMLEvent e = r.nextEvent(); if (e.isStartElement()) { StartElement s = e.asStartElement(); if ("message".equals(s.getName().getLocalPart())) { inMessage = true; } } else if (e.isEndElement()) { EndElement end = e.asEndElement(); if ("message".equals(end.getName().getLocalPart())) { inMessage = false; } } else if (inMessage && e.isCharacters()) { message.append(e.asCharacters().getData()); } } r.close(); result = message.toString(); } catch (XMLStreamException e) { e.printStackTrace(); } } return result; }
From source file:edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java
@Override public Communication fromCharacterBasedFile(final Path path) throws IngestException { if (!Files.exists(path)) throw new IngestException("No file at: " + path.toString()); AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory(); AnalyticUUIDGenerator gen = f.create(); Communication c = new Communication(); c.setUuid(gen.next());//from w w w .jav a 2 s . c o m c.setType(this.getKind()); c.setMetadata(TooledMetadataConverter.convert(this)); try { ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path); c.setId(ef.getName().split("\\.")[0]); } catch (NoSuchFileException | NotFileException e) { // might throw if path is a directory. throw new IngestException(path.toString() + " is not a file, or is a directory."); } String content; try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) { content = IOUtils.toString(bin, StandardCharsets.UTF_8); c.setText(content); } catch (IOException e) { throw new IngestException(e); } try (InputStream is = Files.newInputStream(path); BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8); BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) { XMLEventReader rdr = null; try { rdr = inF.createXMLEventReader(reader); // Below method moves the reader // to the first post element. Section headline = handleHeadline(rdr, content); headline.setUuid(gen.next()); c.addToSectionList(headline); int start = headline.getTextSpan().getStart(); int ending = headline.getTextSpan().getEnding(); if (ending < start) ending = start; // @tongfei: handle empty headlines String htxt = c.getText().substring(start, ending); LOGGER.debug("headline text: {}", htxt); // Section indices. int sectNumber = 1; int subSect = 0; // Move iterator to post start element. this.iterateToPosts(rdr); // Offset pointer. int currOff = -1; SectionFactory sf = new SectionFactory(gen); // First post element. while (rdr.hasNext()) { XMLEvent nextEvent = rdr.nextEvent(); currOff = nextEvent.getLocation().getCharacterOffset(); if (currOff > 0) { int currOffPlus = currOff + 20; int currOffLess = currOff - 20; LOGGER.debug("Offset: {}", currOff); if (currOffPlus < content.length()) LOGGER.debug("Surrounding text: {}", content.substring(currOffLess, currOffPlus)); } // First: see if document is going to end. // If yes: exit. if (nextEvent.isEndDocument()) break; // XMLEvent peeker = rdr.peek(); // Check if start element. if (nextEvent.isStartElement()) { StartElement se = nextEvent.asStartElement(); QName name = se.getName(); final String localName = name.getLocalPart(); LOGGER.debug("Hit start element: {}", localName); //region // Add sections for authors and datetimes for each bolt post // by Tongfei Chen Attribute attrAuthor = se.getAttributeByName(QName.valueOf("author")); Attribute attrDateTime = se.getAttributeByName(QName.valueOf("datetime")); if (attrAuthor != null && attrDateTime != null) { int loc = attrAuthor.getLocation().getCharacterOffset(); int sectAuthorBeginningOffset = loc + "<post author=\"".length(); Section sectAuthor = sf.fromTextSpan(new TextSpan(sectAuthorBeginningOffset, sectAuthorBeginningOffset + attrAuthor.getValue().length()), "author"); c.addToSectionList(sectAuthor); int sectDateTimeBeginningOffset = sectAuthorBeginningOffset + attrAuthor.getValue().length() + " datetime=".length(); Section sectDateTime = sf.fromTextSpan( new TextSpan(sectDateTimeBeginningOffset, sectDateTimeBeginningOffset + attrDateTime.getValue().length()), "datetime"); c.addToSectionList(sectDateTime); } //endregion // Move past quotes, images, and links. if (localName.equals(QUOTE_LOCAL_NAME)) { this.handleQuote(rdr); } else if (localName.equals(IMG_LOCAL_NAME)) { this.handleImg(rdr); } else if (localName.equals(LINK_LOCAL_NAME)) { this.handleLink(rdr); } // not a start element } else if (nextEvent.isCharacters()) { Characters chars = nextEvent.asCharacters(); int coff = chars.getLocation().getCharacterOffset(); if (!chars.isWhiteSpace()) { // content to be captured String fpContent = chars.getData(); LOGGER.debug("Character offset: {}", coff); LOGGER.debug("Character based data: {}", fpContent); // LOGGER.debug("Character data via offset diff: {}", content.substring(coff - fpContent.length(), coff)); SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent); final int tsb = currOff + pads.getKey(); final int tse = currOff + fpContent.length() - pads.getValue(); final String subs = content.substring(tsb, tse); if (subs.replaceAll("\\p{Zs}", "").replaceAll("\\n", "").isEmpty()) { LOGGER.info("Found empty section: skipping."); continue; } LOGGER.debug("Section text: {}", subs); TextSpan ts = new TextSpan(tsb, tse); Section s = sf.fromTextSpan(ts, "post"); List<Integer> intList = new ArrayList<>(); intList.add(sectNumber); intList.add(subSect); s.setNumberList(intList); c.addToSectionList(s); subSect++; } } else if (nextEvent.isEndElement()) { EndElement ee = nextEvent.asEndElement(); currOff = ee.getLocation().getCharacterOffset(); QName name = ee.getName(); String localName = name.getLocalPart(); LOGGER.debug("Hit end element: {}", localName); if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) { sectNumber++; subSect = 0; } } } return c; } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException x) { throw new IngestException(x); } finally { if (rdr != null) try { rdr.close(); } catch (XMLStreamException e) { // not likely. LOGGER.info("Error closing XMLReader.", e); } } } catch (IOException e) { throw new IngestException(e); } }