List of usage examples for javax.xml.stream XMLInputFactory setProperty
public abstract void setProperty(java.lang.String name, Object value) throws java.lang.IllegalArgumentException;
From source file:edu.unc.lib.dl.util.TripleStoreQueryServiceMulgaraImpl.java
/**
 * Sends an ITQL command to Mulgara and extracts the text of every
 * {@code <message>} element found in the XML response.
 *
 * <p>NOTE(review): the original Javadoc listed a {@code RemoteException} for
 * communication failure, but this signature declares no checked exception;
 * confirm how {@code sendTQL} reports such failures.
 *
 * @param query
 *            an ITQL command
 * @return the message returned by Mulgara (the concatenated character data of
 *         all {@code <message>} elements), or {@code null} when there was no
 *         response or the response could not be parsed as XML
 */
public String storeCommand(String query) {
    String result = null;
    String response = this.sendTQL(query);
    if (response != null) {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        // The response comes from a remote server: never resolve external entities (XXE).
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        try (StringReader sr = new StringReader(response)) {
            XMLEventReader r = factory.createXMLEventReader(sr);
            // Local, single-threaded buffer: no need for StringBuffer's synchronization.
            StringBuilder message = new StringBuilder();
            try {
                boolean inMessage = false;
                while (r.hasNext()) {
                    XMLEvent e = r.nextEvent();
                    if (e.isStartElement()) {
                        StartElement s = e.asStartElement();
                        if ("message".equals(s.getName().getLocalPart())) {
                            inMessage = true;
                        }
                    } else if (e.isEndElement()) {
                        EndElement end = e.asEndElement();
                        if ("message".equals(end.getName().getLocalPart())) {
                            inMessage = false;
                        }
                    } else if (inMessage && e.isCharacters()) {
                        message.append(e.asCharacters().getData());
                    }
                }
            } finally {
                // XMLEventReader is not AutoCloseable; release it even when parsing fails.
                r.close();
            }
            // Only publish the message once the reader was parsed and closed cleanly.
            result = message.toString();
        } catch (XMLStreamException e) {
            // Best effort: a malformed response yields a null result.
            e.printStackTrace();
        }
    }
    return result;
}
From source file:com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester.java
private void parse(InfiniteFile f, SourcePojo source) throws MalformedURLException, URISyntaxException { //NOTE: we only ever break out of here because of max docs in standalone mode // (because we don't know how to continue reading) DocumentPojo doc = null;/*w ww . j a v a 2s . c o m*/ //Determine File Extension String fileName = f.getName().toString(); int mid = fileName.lastIndexOf("."); String extension = fileName.substring(mid + 1, fileName.length()); //Checked to save processing time long fileTimestamp = (f.getDate() / 1000) * 1000; // (ensure truncated to seconds, since some operation somewhere hear does this...) Date modDate = new Date(fileTimestamp); //XML Data gets placed into MetaData boolean bIsXml = false; boolean bIsJson = false; boolean bIsLineOriented = false; if ((null != source.getFileConfig()) && (null != source.getFileConfig().type)) { extension = source.getFileConfig().type; } bIsXml = extension.equalsIgnoreCase("xml"); bIsJson = extension.equalsIgnoreCase("json"); bIsLineOriented = extension.endsWith("sv"); if (bIsXml || bIsJson || bIsLineOriented) { int debugMaxDocs = Integer.MAX_VALUE; // by default don't set this, it's only for debug mode if (_context.isStandalone()) { // debug mode debugMaxDocs = maxDocsPerCycle; } //fast check to see if the file has changed before processing (or if it never existed) if (needsUpdated_SourceUrl(modDate, f.getUrlString(), source)) { if (0 != modDate.getTime()) { // if it ==0 then sourceUrl doesn't exist at all, no need to delete // This file already exists - in normal/managed mode will re-create // In streaming mode, simple skip over if (_streaming) { return; } //TESTED DocumentPojo docRepresentingSrcUrl = new DocumentPojo(); docRepresentingSrcUrl.setSourceUrl(f.getUrlString()); docRepresentingSrcUrl.setSourceKey(source.getKey()); docRepresentingSrcUrl.setCommunityId(source.getCommunityIds().iterator().next()); sourceUrlsGettingUpdated.add(docRepresentingSrcUrl.getSourceUrl()); 
this.docsToRemove.add(docRepresentingSrcUrl); // (can add documents with just source URL, are treated differently in the core libraries) } SourceFileConfigPojo fileSystem = source.getFileConfig(); if ((null == fileSystem) && (bIsXml || bIsJson)) { fileSystem = new SourceFileConfigPojo(); } XmlToMetadataParser xmlParser = null; JsonToMetadataParser jsonParser = null; String urlType = extension; if (bIsXml) { xmlParser = new XmlToMetadataParser(fileSystem.XmlRootLevelValues, fileSystem.XmlIgnoreValues, fileSystem.XmlSourceName, fileSystem.XmlPrimaryKey, fileSystem.XmlAttributePrefix, fileSystem.XmlPreserveCase, debugMaxDocs); } //TESTED else if (bIsJson) { jsonParser = new JsonToMetadataParser(fileSystem.XmlSourceName, fileSystem.XmlRootLevelValues, fileSystem.XmlPrimaryKey, fileSystem.XmlIgnoreValues, debugMaxDocs); } //TESTED List<DocumentPojo> partials = null; try { if (bIsXml) { XMLStreamReader xmlStreamReader = null; XMLInputFactory factory = XMLInputFactory.newInstance(); factory.setProperty(XMLInputFactory.IS_COALESCING, true); factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); try { xmlStreamReader = factory.createXMLStreamReader(f.getInputStream()); partials = xmlParser.parseDocument(xmlStreamReader); long memUsage = xmlParser.getMemUsage(); _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); } finally { if (null != xmlStreamReader) xmlStreamReader.close(); } } //TESTED else if (bIsJson) { JsonReader jsonReader = null; try { jsonReader = new JsonReader(new InputStreamReader(f.getInputStream(), "UTF-8")); jsonReader.setLenient(true); partials = jsonParser.parseDocument(jsonReader); long memUsage = jsonParser.getMemUsage(); _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); } finally { if (null != jsonReader) jsonReader.close(); } } //TESTED else if (bIsLineOriented) { // Just generate a document for every line BufferedReader lineReader = null; try { lineReader = new BufferedReader(new InputStreamReader(f.getInputStream(), "UTF-8")); 
CsvToMetadataParser lineParser = new CsvToMetadataParser(debugMaxDocs); partials = lineParser.parseDocument(lineReader, source); long memUsage = lineParser.getMemUsage(); _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); } finally { if (null != lineReader) lineReader.close(); } } //TESTED MessageDigest md5 = null; // (generates unique urls if the user doesn't below) try { md5 = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { // Do nothing, unlikely to happen... } int nIndex = 0; int numPartials = partials.size(); for (DocumentPojo doctoAdd : partials) { nIndex++; doctoAdd.setSource(source.getTitle()); doctoAdd.setSourceKey(source.getKey()); doctoAdd.setMediaType(source.getMediaType()); doctoAdd.setModified(new Date(fileTimestamp)); doctoAdd.setCreated(new Date()); if (null == doctoAdd.getUrl()) { // Can be set in the parser or here doctoAdd.setHasDefaultUrl(true); // (ie cannot occur in a different src URL) if (1 == numPartials) { String urlString = f.getUrlString(); if (urlString.endsWith(urlType)) { doctoAdd.setUrl(urlString); } else { doctoAdd.setUrl( new StringBuffer(urlString).append('.').append(urlType).toString()); } // (we always set sourceUrl as the true url of the file, so want to differentiate the URL with // some useful information) } else if (null == doctoAdd.getMetadata()) { // Line oriented case doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(nIndex) .append('.').append(urlType).toString()); } else { if (null == md5) { // Will never happen, MD5 always exists doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/") .append(doctoAdd.getMetadata().hashCode()).append('.').append(urlType) .toString()); } else { // This is the standard call if the XML parser has not been configured to build the URL doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/") .append(DigestUtils.md5Hex(doctoAdd.getMetadata().toString())) .append('.').append(urlType).toString()); } } //TESTED } 
doctoAdd.setTitle(f.getName().toString()); doctoAdd.setPublishedDate(new Date(fileTimestamp)); doctoAdd.setSourceUrl(f.getUrlString()); // Always add to files because I'm deleting the source URL files.add(doctoAdd); } //TESTED } catch (XMLStreamException e1) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true); } catch (FactoryConfigurationError e1) { errors++; _context.getHarvestStatus().logMessage(e1.getMessage(), true); } catch (IOException e1) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true); } catch (Exception e1) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true); } } //(end if needs updated) } else //Tika supports Excel,Word,Powerpoint,Visio, & Outlook Documents { // (This dedup tells me if it's an add/update vs ignore - qr.isDuplicate higher up tells me if I need to add or update) if (needsUpdated_Url(modDate, f.getUrlString(), source)) { Metadata metadata = null; InputStream in = null; try { doc = new DocumentPojo(); // Create a tika object (first time only) if (null == _tika) { this.initializeTika(_context, source); } // BUGGERY // NEED TO LIKELY SET LIMIT TO BE 30MB or 50MB and BYPASS ANYTHING OVER THAT BELOW IS THE CODE TO DO THAT // tika.setMaxStringLength(30*1024*1024); // Disable the string length limit _tika.setMaxStringLength(-1); //input = new FileInputStream(new File(resourceLocation)); // Create a metadata object to contain the metadata metadata = new Metadata(); // Parse the file and get the text of the file doc.setSource(source.getTitle()); doc.setSourceKey(source.getKey()); doc.setMediaType(source.getMediaType()); String fullText = ""; in = f.getInputStream(); try { if (null == _tikaOutputFormat) { // text only fullText = _tika.parseToString(in, metadata); } //TESTED else { // XML/HMTL _tika.getParser().parse(in, _tikaOutputFormat, metadata, 
_tikaOutputParseContext); fullText = _tikaXmlFormatWriter.toString(); _tikaXmlFormatWriter.getBuffer().setLength(0); } //TESTED } finally { if (null != in) in.close(); } int descCap = 500; doc.setFullText(fullText); if (descCap > fullText.length()) { descCap = fullText.length(); } doc.setDescription(fullText.substring(0, descCap)); doc.setModified(new Date(fileTimestamp)); doc.setCreated(new Date()); doc.setUrl(f.getUrlString()); doc.setTitle(f.getName().toString()); doc.setPublishedDate(new Date(fileTimestamp)); long memUsage = (250L * (doc.getFullText().length() + doc.getDescription().length())) / 100L; // 25% overhead, 2x for string->byte _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); // If the metadata contains a more plausible date then use that try { String title = metadata.get(Metadata.TITLE); if (null != title) { doc.setTitle(title); } } catch (Exception e) { // Fine just carry on } try { Date date = metadata.getDate(Metadata.CREATION_DATE); // MS Word if (null != date) { doc.setPublishedDate(date); } else { date = metadata.getDate(Metadata.DATE); // Dublin if (null != date) { doc.setPublishedDate(date); } else { date = metadata.getDate(Metadata.ORIGINAL_DATE); if (null != date) { doc.setPublishedDate(date); } } } } catch (Exception e) { // Fine just carry on } //TESTED // If the metadata contains a geotag then apply that: try { String lat = metadata.get(Metadata.LATITUDE); String lon = metadata.get(Metadata.LONGITUDE); if ((null != lat) && (null != lon)) { GeoPojo gt = new GeoPojo(); gt.lat = Double.parseDouble(lat); gt.lon = Double.parseDouble(lon); doc.setDocGeo(gt); } } catch (Exception e) { // Fine just carry on } // Save the entire metadata: doc.addToMetadata("_FILE_METADATA_", metadata); for (ObjectId communityId : source.getCommunityIds()) { doc.setCommunityId(communityId); } files.add(doc); // Close the input stream in.close(); in = null; //TESTED } catch (SmbException e) { errors++; _context.getHarvestStatus() 
.logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } catch (MalformedURLException e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } catch (UnknownHostException e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } catch (IOException e) { errors++; _context.getHarvestStatus().logMessage(e.getMessage(), true); } catch (TikaException e) { errors++; _context.getHarvestStatus().logMessage(e.getMessage(), true); } catch (Exception e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } finally { // Close the input stream if an error occurs if (null != in) { try { in.close(); } catch (IOException e) { // All good, do nothing } } } // end exception handling } // end dedup check } // end XML vs "office" app //DEBUG //System.out.println("FILE=" + files.size() + " / MEM=" + _memUsage + " VS " + Runtime.getRuntime().totalMemory()); }
From source file:com.clustercontrol.agent.winevent.WinEventMonitor.java
/** * XMLStAX???EventLogRecord????/* w w w. j a v a2s .co m*/ * @param eventXmlStream * @return EventLogRecord? */ private ArrayList<EventLogRecord> parseEventXML(InputStream eventXmlStream) { ArrayList<EventLogRecord> eventlogs = new ArrayList<EventLogRecord>(); try { XMLInputFactory xmlif = XMLInputFactory.newInstance(); /** * OpenJDK7/OracleJDK7??"]"?2????????????????????????????? * ?XML?????????OpenJDK7/OracleJDK7???????/?????????? * URL??????????????? * * URL * http://docs.oracle.com/javase/jp/6/api/javax/xml/stream/XMLStreamReader.html#next() */ String xmlCoalescingKey = "javax.xml.stream.isCoalescing";// TODO JRE??????????????????? if (m_log.isDebugEnabled()) { m_log.debug(xmlCoalescingKey + " = true"); } xmlif.setProperty(xmlCoalescingKey, true); XMLStreamReader xmlr = xmlif.createXMLStreamReader(eventXmlStream); while (xmlr.hasNext()) { switch (xmlr.getEventType()) { case XMLStreamConstants.START_ELEMENT: m_log.trace("EventType : XMLStreamConstants.START_ELEMENT"); String localName = xmlr.getLocalName(); m_log.trace("local name : " + localName); if ("Event".equals(localName)) { EventLogRecord eventlog = new EventLogRecord(); eventlogs.add(eventlog); m_log.debug("create new EventLogRecord"); } else { String attrLocalName = null; String attrValue = null; if (xmlr.getAttributeCount() != 0) { attrLocalName = xmlr.getAttributeLocalName(0); attrValue = xmlr.getAttributeValue(0); m_log.trace("attribute local name : " + attrLocalName); m_log.trace("attribute local value : " + attrValue); } if ("Provider".equals(localName)) { if ("Name".equals(attrLocalName)) { m_log.trace("target value : " + attrValue); EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1); eventlog.setProviderName(attrValue); m_log.debug("set ProviderName : " + eventlog.getProviderName()); } } // Get-WinEvent/wevtutil.exe else if ("TimeCreated".equals(localName) && "SystemTime".equals(attrLocalName)) { m_log.trace("target value : " + attrValue); // 
"yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS'Z'"???S???????????? String formatedDateString = attrValue.replaceAll("\\..*Z", ""); m_log.trace("formatted target value : " + formatedDateString); DateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"); sdf.setTimeZone(TimeZone.getTimeZone("UTC")); EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1); ; try { eventlog.setTimeCreated(sdf.parse(formatedDateString)); } catch (ParseException e) { // do nothing m_log.error("set TimeCreated Error", e); } m_log.debug("set TimeCreated : " + eventlog.getTimeCreated()); } // Get-EventLog if ("TimeGenerated".equals(localName) && "SystemTime".equals(attrLocalName)) { m_log.trace("target value : " + attrValue); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss'Z'"); sdf.setTimeZone(HinemosTime.getTimeZone()); EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1); ; try { eventlog.setTimeCreated(sdf.parse(attrValue)); } catch (ParseException e) { // do nothing m_log.error("set TimeCreated Error", e); } m_log.debug("set TimeCreated : " + eventlog.getTimeCreated()); } else { targetProperty = localName; m_log.trace("target property : " + targetProperty); } } break; case XMLStreamConstants.SPACE: case XMLStreamConstants.CHARACTERS: m_log.trace("EventType : XMLStreamConstants.CHARACTERS, length=" + xmlr.getTextLength()); if (targetProperty != null) { try { EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1); ; if ("EventID".equals(targetProperty)) { eventlog.setId(Integer.parseInt(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength()))); m_log.debug("set EventID : " + eventlog.getId()); } // Get-WinEvent/wevtutil.exe else if ("Level".equals(targetProperty)) { if (eventlog.getLevel() == WinEventConstant.UNDEFINED) { eventlog.setLevel(Integer.parseInt(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength()))); m_log.debug("set Level : " + eventlog.getLevel()); } } else if ("Task".equals(targetProperty)) { 
if (eventlog.getTask() == WinEventConstant.UNDEFINED) { eventlog.setTask(Integer.parseInt(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength()))); m_log.debug("set Task : " + eventlog.getTask()); } } else if ("Keywords".equals(targetProperty)) { // TODO ????????0x8080000000000000 //eventlog.setKeywords(Long.decode(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength()))); //m_log.debug("set Keywords : " + eventlog.getKeywords()); } else if ("EventRecordId".equals(targetProperty)) { eventlog.setRecordId(Long.parseLong(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength()))); m_log.debug("set RecordId : " + eventlog.getRecordId()); } else if ("Channel".equals(targetProperty)) { eventlog.setLogName(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength())); m_log.debug("set LogName : " + eventlog.getLogName()); } else if ("Computer".equals(targetProperty)) { eventlog.setMachineName(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength())); m_log.debug("set MachineName : " + eventlog.getMachineName()); } else if ("Message".equals(targetProperty)) { String message = new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength()); message = message.replaceAll(tmpReturnCode, "\r\n"); message = message.replaceAll(tmpLtCode, "<"); message = message.replaceAll(tmpGtCode, ">"); eventlog.setMessage(message); m_log.debug("set Message : " + eventlog.getMessage()); } else if ("Data".equals(targetProperty)) { String data = new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength()); eventlog.getData().add(data); m_log.debug("set Data : " + data); } else { m_log.debug("unknown target property : " + targetProperty); } } catch (NumberFormatException e) { m_log.debug("number parse error", e); } } targetProperty = null; break; default: // break; } xmlr.next(); } xmlr.close(); } catch (XMLStreamException e) { m_log.warn("parseEvent() 
xmlstream error", e); } return eventlogs; }
From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSDaemonStatusTableXmlStreamParser.java
/** * * @param stream//from ww w . jav a 2 s . com * @param encoding */ @Override public void read(InputStream stream, String encoding) { // logger.setLevel(Level.FINE); // 1. create Input factory XMLInputFactory xmlif = XMLInputFactory.newInstance(); xmlif.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); xmlif.setProperty("javax.xml.stream.isNamespaceAware", java.lang.Boolean.TRUE); long startTime = System.currentTimeMillis(); int noAUs = 0; String aus = null; String currentTableId = null; String currentTableTitle = null; String currentTableKey = null; boolean hasErrorsColumn = false; String siAuId = null; XMLStreamReader xmlr = null; try { // create reader xmlr = xmlif.createXMLStreamReader(new BufferedInputStream(stream), encoding); String curElement = ""; boolean isLastTagnameTable = false; String targetTagName = "row"; String cellTagName = "columnname"; boolean withinSummaryinfo = false; boolean withinColumndescriptor = false; boolean withinRow = false; boolean withinCell = false; boolean withinReference = false; boolean isCrawlStatusActive = false; boolean isCrawlStatusColumn = false; int valueTagCounter = 0; String currentColumnName = null; String currentCellValue = null; String currentCellKey = null; SummaryInfo si = null; List<String> rowData = null; Map<String, String> rowDataH = null; w1: while (xmlr.hasNext()) { int eventType = xmlr.next(); switch (eventType) { case XMLStreamConstants.START_ELEMENT: curElement = xmlr.getLocalName(); // note: getName() -> // QName logger.log(Level.FINE, "--------- start tag = <{0}> ---------", curElement); // check the table name first if (curElement.equals("table")) { isLastTagnameTable = true; } else if (curElement.equals("error")) { isTargetPageValid = false; break w1; } if (isLastTagnameTable) { if (curElement.equals("name")) { currentTableId = xmlr.getElementText(); logger.log(Level.FINE, "########### table Id = [{0}] ###########", currentTableId); tableId = currentTableId; if 
(belongsInclusionTableList.contains(currentTableId)) { logger.log(Level.FINE, "!!!!! Table ({0}) belongs to the target list !!!!!", currentTableId); } else { logger.log(Level.FINE, "XXXXXXXXXXX Table ({0}) does not belong to the target list XXXXXXXXXXX", currentTableId); break w1; } } else if (curElement.equals("key")) { currentTableKey = xmlr.getElementText(); logger.log(Level.FINE, "---------- table key = ({0}) ----------", currentTableKey); tableKey = currentTableKey; } else if (curElement.equals("title")) { currentTableTitle = xmlr.getElementText(); logger.log(Level.FINE, "+++++++++ table Title = ({0}) +++++++++", currentTableTitle); if (currentTableId.equals("PeerRepair")) { if (currentTableTitle.startsWith("Repair candidates for AU: ")) { currentTableTitle = currentTableTitle.replaceFirst("Repair candidates for AU: ", ""); logger.log(Level.FINE, "save this modified table-Title as auName={0}", currentTableTitle); this.tableTitle = currentTableTitle; } else { logger.log(Level.WARNING, "The table-Title does not start with the expected token={0}", currentTableTitle); } } isLastTagnameTable = false; } } if (curElement.equals("columndescriptor")) { withinColumndescriptor = true; } else if (curElement.equals("row")) { withinRow = true; rowCounter++; logger.log(Level.FINE, "================== {0}-th row starts here ==================", rowCounter); // set-up the table storage //if (rowCounter == 1) { // 1st row rowData = new ArrayList<String>(); rowDataH = new LinkedHashMap<String, String>(); //} } else if (curElement.equals("cell")) { logger.log(Level.FINE, "entering a cell"); withinCell = true; } else if (curElement.equals("reference")) { withinReference = true; logger.log(Level.FINE, "within reference on"); } else if (curElement.equals("summaryinfo")) { withinSummaryinfo = true; si = new SummaryInfo(); } else if (curElement.equals("value")) { logger.log(Level.FINE, "entering a value"); valueTagCounter++; } //---- columndescriptor tag 
--------------------------------------------------- if (withinColumndescriptor) { if (curElement.equals("name")) { String nameText = xmlr.getElementText(); logger.log(Level.FINE, "\tcolumndescriptor: name = {0}", nameText); columndescriptorList.add(nameText); } else if (curElement.equals("title")) { String titleText = xmlr.getElementText(); logger.log(Level.FINE, "\tcolumndescriptor: title = {0}", titleText); } else if (curElement.equals("type")) { String typeText = xmlr.getElementText(); logger.log(Level.FINE, "\tcolumndescriptor: type = {0}", typeText); getTypeList().add(typeText); } } //---- cell tag ---------------------------------------------------------------- if (withinCell) { logger.log(Level.FINE, "parsing withinCell"); if (curElement.equals("columnname")) { String columnname = xmlr.getElementText(); logger.log(Level.FINE, "\t\tcolumnname = {0}", columnname); currentColumnName = columnname; if (columnname.equals("crawl_status")) { isCrawlStatusColumn = true; } else { isCrawlStatusColumn = false; } if (columnname.equals("Errors")) { hasErrorsColumn = true; } } else { // value tag block: either value-tag WO a child element // or with a child element /* * <value><reference>...<value>xxxx</value> * <value>xxxx</value> */ if ((curElement.equals("value")) && (!withinReference)) { logger.log(Level.FINE, "entering el:value/WO-REF block"); if (!hasReferenceTag.contains(currentColumnName)) { logger.log(Level.FINE, "No child reference tag is expected for this value tag"); logger.log(Level.FINEST, "xmlr.getEventType():pre-parsing={0}", xmlr.getEventType()); String cellValue = xmlr.getElementText(); // note: the above parsing action moves the // cursor to the end-tag, i.e., </value> // therefore, the end-element-switch-block below // cannot catch this </value> tag logger.log(Level.FINE, "\t\t\t[No ref: value] {0} = {1}", new Object[] { currentColumnName, cellValue }); currentCellValue = cellValue; logger.log(Level.FINEST, "xmlr.getEventType():post-parsing={0}", 
xmlr.getEventType()); // store this value // rowData logger.log(Level.FINE, "current column name={0}", currentColumnName); logger.log(Level.FINE, "valueTagCounter={0}", valueTagCounter); if (currentColumnName.endsWith("Damaged")) { if (valueTagCounter <= 1) { // 2nd value tag is footnot for this column // ignore this value rowData.add(cellValue); rowDataH.put(currentColumnName, currentCellValue); } } else { rowData.add(cellValue); rowDataH.put(currentColumnName, currentCellValue); } } else { // previously this block was unthinkable, but // it was found that there are columns that // temporarily have a <reference> tag in // crawl_status_table; these columns are // included in hasReferenceTag by default; // thus, for such unstable columns, // when they hava a <reference tag, // data are caputred in another within- // reference block; however, when these // columns no longer have <reference> tag, // text data would be left uncaptured unless // some follow-up processing takes place here logger.log(Level.FINE, "May have to capture data: column={0}", currentColumnName); if (mayHaveReferenceTag.contains(currentColumnName) && !isCrawlStatusActive) { // because the crawling is not active, // it is safely assume that the maybe columns have no reference tag // 2011-10-24 the above assumption was found wrong // a crawling cell does not say active but // subsequent columns have a reference logger.log(Level.FINE, "a text or a reference tag : try to parse it as a text"); String cellValue = null; try { cellValue = xmlr.getElementText(); } catch (javax.xml.stream.XMLStreamException ex) { continue; } finally { } logger.log(Level.FINE, "\t\t\t[value WO-ref(crawling_NOT_active case)={0}]", currentColumnName + " = " + cellValue); currentCellValue = cellValue; // store this value // rowData logger.log(Level.FINE, "\t\t\tcurrent columnName={0}", currentColumnName); rowData.add(cellValue); rowDataH.put(currentColumnName, currentCellValue); } else { logger.log(Level.FINE, "WO-Ref: no 
processing items now:{0}", curElement); } } } else if (withinReference) { // reference tag exists logger.log(Level.FINE, "WR:curElement={0}", curElement); if (curElement.equals("key")) { String cellKey = xmlr.getElementText(); logger.log(Level.FINE, "\t\tcurrentCellKey is set to={0}", cellKey); currentCellKey = cellKey; } else if (curElement.equals("value")) { String cellValue = xmlr.getElementText(); logger.log(Level.FINE, "\t\twr: {0} = {1}", new Object[] { currentColumnName, cellValue }); // exception cases follow: if (currentColumnName.equals("AuName")) { logger.log(Level.FINE, "\t\tAuName is replaced with the key[=AuId]= {0}", currentCellKey); // rowData // This block is for ArchivalUnitStatusTable // add the key as a new datum (auId) // ahead of its value rowData.add(currentCellKey); rowDataH.put("AuId", currentCellKey); currentCellValue = cellValue; } else if (currentColumnName.equals("auId")) { // This block is for V3PollerTable logger.log(Level.FINE, "\t\tnew value for auId(V3PollerTable)={0}", currentCellKey); // deprecated after 2012-02-02: use key as data // currentCellValue = currentCellKey; // add auName as a new column ahead of auId rowData.add(cellValue); rowDataH.put("auName", cellValue); logger.log(Level.FINE, "\t\tauName(V3PollerTable)={0}", cellValue); currentCellValue = currentCellKey; } else if (currentColumnName.equals("pollId")) { // this block is for V3PollerTable logger.log(Level.FINE, "\t\tFull string (key) is used={0}", currentCellKey); // The key has the complete string whereas // the value is its truncated copy currentCellValue = currentCellKey; } else if (currentColumnName.equals("au")) { logger.log(Level.FINE, "\t\tauId is used instead for au(crawl_status_table)={0}", currentCellKey); // 2012-02-02: add auName ahead of au rowData.add(cellValue); rowDataH.put("auName", cellValue); logger.log(Level.FINE, "\t\tauName={0}", cellValue); // rowData // This block is for crawl_status_table // save the key(auId) instead of value 
currentCellValue = currentCellKey; } else if (currentColumnName.equals("Peers")) { logger.log(Level.FINE, "\t\tURL (key) is used={0}", currentCellKey); currentCellValue = DaemonStatusDataUtil.escapeHtml(currentCellKey); logger.log(Level.FINE, "\t\tAfter encoding ={0}", currentCellValue); } else { if (isCrawlStatusColumn) { // if the craw status column is // "active", some later columns // may have a reference tag // so turn on the switch if (cellValue.equals("Active") || (cellValue.equals("Pending"))) { isCrawlStatusActive = true; } else { isCrawlStatusActive = false; } } // the default processing currentCellValue = cellValue; } // store currentCellValue logger.log(Level.FINE, "currentCellValue={0}", currentCellValue); // rowData rowData.add(currentCellValue); rowDataH.put(currentColumnName, currentCellValue); } // Within ref tag: key and valu processing } // value with text or value with ref tag } // columnname or value } // within cell // ---- summaryinfo tag -------------------------------------------------------- if (withinSummaryinfo) { logger.log(Level.FINE, "============================ Within SummaryInfo ============================ "); if (curElement.equals("title")) { String text = xmlr.getElementText(); si.setTitle(text); logger.log(Level.FINE, "\tsi:titile={0}", si.getTitle()); } else if (curElement.equals("type")) { String text = xmlr.getElementText(); si.setType(Integer.parseInt(text)); logger.log(Level.FINE, "\tsi:type={0}", si.getType()); } else if (curElement.equals("key")) { if (withinReference && si.getTitle().equals("Volume")) { String text = xmlr.getElementText(); logger.log(Level.FINE, "\tsi:key contents(Volume case)={0}", text); siAuId = text; // si.setValue(text); logger.log(Level.FINE, "\tsi:value(Volume case)={0}", siAuId); } } else if (curElement.equals("value")) { if (withinReference) { if (hasRefTitileTagsSI.contains(si.getTitle())) { if (si.getTitle().equals("Volume")) { // 2012-02-02 use the au name String text = 
xmlr.getElementText(); si.setValue(text); logger.log(Level.FINE, "\tsi:value(Volume case)={0}", si.getValue()); } else { String text = xmlr.getElementText(); si.setValue(text); logger.log(Level.FINE, "\tsi:value={0}", si.getValue()); } } } else { // note: 2012-02-07 // daemon 1.59.2 uses the new layout for AU page // this layout includes a summaryinfo tag // that now contains a reference tag String text = null; try { text = xmlr.getElementText(); if (!hasRefTitileTagsSI.contains(si.getTitle())) { si.setValue(text); logger.log(Level.FINE, "\tsi:value={0}", si.getValue()); } } catch (javax.xml.stream.XMLStreamException ex) { logger.log(Level.WARNING, "encounter a reference tag rather than text"); continue; } finally { } } } /* * aus = xmlr.getElementText(); * out.println("found token=[" + aus + "]"); if * (currentTableId.equals("ArchivalUnitStatusTable")) { * m = pau.matcher(aus); if (m.find()) { * out.println("How many AUs=" + m.group(1)); noAUs = * Integer.parseInt(m.group(1)); } else { * out.println("not found within[" + aus + "]"); } } */ } break; case XMLStreamConstants.CHARACTERS: break; case XMLStreamConstants.ATTRIBUTE: break; case XMLStreamConstants.END_ELEMENT: if (xmlr.getLocalName().equals("columndescriptor")) { withinColumndescriptor = false; logger.log(Level.FINE, "leaving columndescriptor"); } else if (xmlr.getLocalName().equals("row")) { if (withinRow) { logger.log(Level.FINE, "========= end of the target row element"); withinRow = false; } if (!isCrawlStatusActive) { tabularData.add(rowData); tableData.add(rowDataH); } else { rowIgnored++; rowCounter--; } rowData = null; rowDataH = null; isCrawlStatusActive = false; } else if (xmlr.getLocalName().equals("cell")) { // rowDataH.add(cellDatum); cellCounter++; withinCell = false; currentColumnName = null; currentCellValue = null; currentCellKey = null; isCrawlStatusColumn = false; valueTagCounter = 0; logger.log(Level.FINE, "leaving cell"); } else if (xmlr.getLocalName().equals("columnname")) { 
logger.log(Level.FINE, "leaving columnname"); } else if (xmlr.getLocalName().equals("reference")) { withinReference = false; } else if (xmlr.getLocalName().equals("summaryinfo")) { logger.log(Level.FINE, "si={0}", si.toString()); summaryInfoList.add(si); si = null; withinSummaryinfo = false; } else if (xmlr.getLocalName().equals("value")) { logger.log(Level.FINE, "leaving value"); } else { logger.log(Level.FINE, "--------- end tag = <{0}> ---------", curElement); } break; case XMLStreamConstants.END_DOCUMENT: logger.log(Level.FINE, "Total of {0} row occurrences", rowCounter); } // end: switch } // end:while } catch (XMLStreamException ex) { logger.log(Level.WARNING, "XMLStreamException occurs", ex); this.isTargetPageValid = false; } catch (RuntimeException re) { logger.log(Level.WARNING, "some RuntimeException occurs", re); this.isTargetPageValid = false; } catch (Exception e) { logger.log(Level.WARNING, "some Exception occurs", e); this.isTargetPageValid = false; } finally { // 5. close reader/IO if (xmlr != null) { try { xmlr.close(); } catch (XMLStreamException ex) { logger.log(Level.WARNING, "XMLStreamException occurs during close()", ex); } } if (!this.isTargetPageValid) { logger.log(Level.WARNING, "This parsing session may not be complete due to some exception reported earlier"); } } // end of try if (currentTableId.equals("V3PollerDetailTable")) { summaryInfoList.add(new SummaryInfo("auId", 4, siAuId)); summaryInfoMap = new LinkedHashMap<String, String>(); for (SummaryInfo si : summaryInfoList) { summaryInfoMap.put(si.getTitle(), si.getValue()); } } // parsing summary logger.log(Level.FINE, "###################### parsing summary ######################"); logger.log(Level.FINE, "currentTableId={0}", currentTableId); logger.log(Level.FINE, "currentTableTitle={0}", currentTableTitle); logger.log(Level.FINE, "currentTableKey={0}", currentTableKey); logger.log(Level.FINE, "columndescriptorList={0}", columndescriptorList); logger.log(Level.FINE, "# of 
columndescriptors={0}", columndescriptorList.size()); logger.log(Level.FINE, "typeList={0}", typeList); logger.log(Level.FINE, "# of rows counted={0}", rowCounter); logger.log(Level.FINE, "# of rows excluded[active ones are excluded]={0}", rowIgnored); logger.log(Level.FINE, "summaryInfoList:size={0}", summaryInfoList.size()); logger.log(Level.FINE, "summaryInfoList={0}", summaryInfoList); logger.log(Level.FINE, "table: cell counts = {0}", cellCounter); logger.log(Level.FINE, "tableData[map]=\n{0}", tableData); logger.log(Level.FINE, "tabularData[list]=\n{0}", tabularData); /* * if (currentTableId.equals("ArchivalUnitStatusTable")) { if * (rowCounter == noAUs) { out.println("au counting is OK=" + * rowCounter); } else { err.println("au counting disagreement"); throw * new RuntimeException("parsing error is suspected"); } } */ logger.log(Level.FINE, " completed in {0} ms\n\n", (System.currentTimeMillis() - startTime)); if (!columndescriptorList.isEmpty()) { int noCols = columndescriptorList.size(); if (currentTableId.equals("V3PollerTable") && !hasErrorsColumn) { noCols--; } int noCellsExpd = rowCounter * noCols; if (noCols > 0) { // this table has a table logger.log(Level.FINE, "checking parsing results: table dimmensions"); if (noCellsExpd == cellCounter) { logger.log(Level.FINE, "table dimensions and cell-count are consistent"); } else { int diff = noCellsExpd - cellCounter; logger.log(Level.FINE, "The table has {0} incomplete cells", diff); hasIncompleteRows = true; setIncompleteRowList(); logger.log(Level.FINE, "incomplete rows: {0}", incompleteRows); } } } }
From source file:com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester.java
/** * processMeta - handle an individual field *//*from w w w .ja v a2 s. co m*/ private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source, UnstructuredAnalysisConfigPojo uap) { boolean bAllowDuplicates = false; if ((null != m.flags) && m.flags.contains("U")) { bAllowDuplicates = true; } if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) { Pattern metaPattern = createRegex(m.script, m.flags); int timesToRun = 1; Object[] currField = null; if ((null != m.flags) && m.flags.contains("c")) { currField = f.getMetadata().get(m.fieldName); } if (null != currField) { // chained metadata timesToRun = currField.length; text = (String) currField[0]; } //TESTED Matcher matcher = metaPattern.matcher(text); LinkedList<String> Llist = null; for (int ii = 0; ii < timesToRun; ++ii) { if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above) text = (String) currField[ii]; matcher = metaPattern.matcher(text); } //TESTED StringBuffer prefix = new StringBuffer(m.fieldName).append(':'); int nFieldNameLen = m.fieldName.length() + 1; try { while (matcher.find()) { if (null == Llist) { Llist = new LinkedList<String>(); } if (null == m.groupNum) { m.groupNum = 0; } String toAdd = matcher.group(m.groupNum); if (null != m.replace) { toAdd = metaPattern.matcher(toAdd).replaceFirst(m.replace); } if ((null != m.flags) && m.flags.contains("H")) { toAdd = StringEscapeUtils.unescapeHtml(toAdd); } prefix.setLength(nFieldNameLen); prefix.append(toAdd); String dupCheck = prefix.toString(); if (!regexDuplicates.contains(dupCheck)) { Llist.add(toAdd); if (!bAllowDuplicates) { regexDuplicates.add(dupCheck); } } } } catch (Exception e) { this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true); } } //(end metadata chaining handling) if (null != Llist) { if (null != currField) { // (overwrite) f.getMetadata().put(m.fieldName, Llist.toArray()); } else { f.addToMetadata(m.fieldName, 
Llist.toArray()); } } //TESTED } else if (m.scriptlang.equalsIgnoreCase("javascript")) { if (null == f.getMetadata()) { f.setMetadata(new LinkedHashMap<String, Object[]>()); } //set the script engine up if necessary if ((null != source) && (null != uap)) { //(these are null if called from new processing pipeline vs legacy code) intializeScriptEngine(source, uap); } try { //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs // (also should be able to use SAH _document object I think?) // Javascript: the user passes in Object[] currField = f.getMetadata().get(m.fieldName); if ((null == m.flags) || m.flags.isEmpty()) { if (null == currField) { engine.put("text", text); engine.put("_iterator", null); } //(otherwise will just pass the current fields in there) } else { // flags specified if (m.flags.contains("t")) { // text engine.put("text", text); } if (m.flags.contains("d")) { // entire document (minus ents and assocs) GsonBuilder gb = new GsonBuilder(); Gson g = gb.create(); List<EntityPojo> ents = f.getEntities(); List<AssociationPojo> assocs = f.getAssociations(); try { f.setEntities(null); f.setAssociations(null); engine.put("document", g.toJson(f)); securityManager.eval(engine, JavaScriptUtils.initScript); } finally { f.setEntities(ents); f.setAssociations(assocs); } } if (m.flags.contains("m")) { // metadata GsonBuilder gb = new GsonBuilder(); Gson g = gb.create(); engine.put("_metadata", g.toJson(f.getMetadata())); securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript); } } //(end flags processing) if (null != currField) { f.getMetadata().remove(m.fieldName); GsonBuilder gb = new GsonBuilder(); Gson g = gb.create(); engine.put("_iterator", g.toJson(currField)); securityManager.eval(engine, JavaScriptUtils.iteratorDocScript); } //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified) Object returnVal = securityManager.eval(engine, m.script); if (null != returnVal) 
{ if (returnVal instanceof String) { // The only easy case Object[] array = new Object[1]; if ((null != m.flags) && m.flags.contains("H")) { returnVal = StringEscapeUtils.unescapeHtml((String) returnVal); } array[0] = returnVal; f.addToMetadata(m.fieldName, array); } else { // complex object or array - in either case the engine turns these into // internal.NativeArray or internal.NativeObject BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine); f.addToMetadata(m.fieldName, outList.toArray()); } } } catch (ScriptException e) { _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); // Just do nothing and log // e.printStackTrace(); //DEBUG (don't output log messages per doc) //logger.error(e.getMessage()); } catch (Exception e) { _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); // Just do nothing and log // e.printStackTrace(); //DEBUG (don't output log messages per doc) //logger.error(e.getMessage()); } } else if (m.scriptlang.equalsIgnoreCase("xpath")) { String xpath = m.script; try { createHtmlCleanerIfNeeded(); int timesToRun = 1; Object[] currField = null; if ((null != m.flags) && m.flags.contains("c")) { currField = f.getMetadata().get(m.fieldName); } if (null != currField) { // chained metadata f.getMetadata().remove(m.fieldName); // (so will add to the end) timesToRun = currField.length; text = (String) currField[0]; } //TESTED for (int ii = 0; ii < timesToRun; ++ii) { if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above) text = (String) currField[ii]; } //TESTED TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes())); //NewCode : Only use html cleaner for cleansing //use JAXP for full Xpath lib Document doc = new DomSerializer(new CleanerProperties()).createDOM(node); String extraRegex = extractRegexFromXpath(xpath); if (extraRegex != null) xpath = 
xpath.replace(extraRegex, ""); XPath xpa = XPathFactory.newInstance().newXPath(); NodeList res = (NodeList) xpa.evaluate(xpath, doc, XPathConstants.NODESET); if (res.getLength() > 0) { if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object m.groupNum = -1; // (see bConvertToObject below) } StringBuffer prefix = new StringBuffer(m.fieldName).append(':'); int nFieldNameLen = m.fieldName.length() + 1; ArrayList<Object> Llist = new ArrayList<Object>(res.getLength()); boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1)); boolean convertToXml = ((null != m.flags) && (m.flags.contains("x"))); for (int i = 0; i < res.getLength(); i++) { Node info_node = res.item(i); if ((null != m.flags) && (m.flags.contains("g"))) { Llist.add(parseHtmlTable(info_node, m.replace)); } else if (bConvertToObject || convertToXml) { // Try to create a JSON object out of this StringWriter writer = new StringWriter(); try { Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.transform(new DOMSource(info_node), new StreamResult(writer)); } catch (TransformerException e1) { continue; } if (bConvertToObject) { try { JSONObject subObj = XML.toJSONObject(writer.toString()); if (xpath.endsWith("*")) { // (can have any number of different names here) Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj)); } //TESTED else { String[] rootNames = JSONObject.getNames(subObj); if (1 == rootNames.length) { // (don't think it can't be any other number in fact) subObj = subObj.getJSONObject(rootNames[0]); } boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H")); Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj, bUnescapeHtml)); } //TESTED } catch (JSONException e) { // Just carry on continue; } //TESTED } else { // leave in XML form Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>) } //TESTED (xpath_test.json) } else { // Treat this as 
string, either directly or via regex String info = info_node.getTextContent().trim(); if (extraRegex == null || extraRegex.isEmpty()) { prefix.setLength(nFieldNameLen); prefix.append(info); String dupCheck = prefix.toString(); if (!regexDuplicates.contains(dupCheck)) { if ((null != m.flags) && m.flags.contains("H")) { info = StringEscapeUtils.unescapeHtml(info); } Llist.add(info); if (!bAllowDuplicates) { regexDuplicates.add(dupCheck); } } } else { // Apply regex to the string Pattern dataRegex = createRegex(extraRegex, m.flags); Matcher dataMatcher = dataRegex.matcher(info); boolean result = dataMatcher.find(); while (result) { String toAdd; if (m.groupNum != null) toAdd = dataMatcher.group(m.groupNum); else toAdd = dataMatcher.group(); prefix.setLength(nFieldNameLen); prefix.append(toAdd); String dupCheck = prefix.toString(); if (!regexDuplicates.contains(dupCheck)) { if ((null != m.flags) && m.flags.contains("H")) { toAdd = StringEscapeUtils.unescapeHtml(toAdd); } Llist.add(toAdd); if (!bAllowDuplicates) { regexDuplicates.add(dupCheck); } } result = dataMatcher.find(); } } //(regex vs no regex) } //(end string vs object) } if (Llist.size() > 0) { f.addToMetadata(m.fieldName, Llist.toArray()); } } } //(end loop over metadata objects if applicable) } catch (IOException ioe) { _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(), true); // Just do nothing and log //DEBUG (don't output log messages per doc) //logger.error(ioe.getMessage()); } catch (ParserConfigurationException e1) { _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true); // Just do nothing and log //DEBUG (don't output log messages per doc) //logger.error(e1.getMessage()); } catch (XPathExpressionException e1) { _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true); } } else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface // which 
one? try { boolean json = false; boolean xml = false; for (int i = 0; i < 128; ++i) { if ('<' == text.charAt(i)) { xml = true; break; } if ('{' == text.charAt(i) || '[' == text.charAt(i)) { json = true; break; } if (!Character.isSpaceChar(text.charAt(i))) { break; } } //TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2) boolean textNotObject = m.flags == null || !m.flags.contains("o"); List<DocumentPojo> docs = new LinkedList<DocumentPojo>(); List<String> levelOneFields = null; if (null != m.script) { levelOneFields = Arrays.asList(m.script.split("\\s*,\\s*")); if ((1 == levelOneFields.size()) && levelOneFields.get(0).isEmpty()) { // convert [""] to null levelOneFields = null; } } //TESTED (json and xml) if (xml) { XmlToMetadataParser parser = new XmlToMetadataParser(levelOneFields, null, null, null, null, null, Integer.MAX_VALUE); XMLInputFactory factory = XMLInputFactory.newInstance(); factory.setProperty(XMLInputFactory.IS_COALESCING, true); factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); XMLStreamReader reader = null; try { reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes())); docs = parser.parseDocument(reader, textNotObject); } finally { if (null != reader) reader.close(); } } //TESTED (meta_stream_test, test1) if (json) { JsonReader jsonReader = null; try { JsonToMetadataParser parser = new JsonToMetadataParser(null, levelOneFields, null, null, Integer.MAX_VALUE); jsonReader = new JsonReader( new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8")); jsonReader.setLenient(true); docs = parser.parseDocument(jsonReader, textNotObject); } finally { if (null != jsonReader) jsonReader.close(); } } //TESTED (meta_stream_test test2) if (!docs.isEmpty()) { ArrayList<String> Llist = null; ArrayList<Object> LlistObj = null; if (textNotObject) { Llist = new ArrayList<String>(docs.size()); } else { LlistObj = new ArrayList<Object>(docs.size()); } for (DocumentPojo doc 
: docs) { if ((null != doc.getFullText()) || (null != doc.getMetadata())) { if (textNotObject) { Llist.add(doc.getFullText()); } //TESTED else if (xml) { LlistObj.add(doc.getMetadata()); } //TESTED else if (json) { Object o = doc.getMetadata(); if (null != o) { o = doc.getMetadata().get("json"); if (o instanceof Object[]) { LlistObj.addAll(Arrays.asList((Object[]) o)); } else if (null != o) { LlistObj.add(o); } //TESTED } } //TESTED } } //TESTED if ((null != Llist) && !Llist.isEmpty()) { f.addToMetadata(m.fieldName, Llist.toArray()); } //TESTED if ((null != LlistObj) && !LlistObj.isEmpty()) { f.addToMetadata(m.fieldName, LlistObj.toArray()); } //TESTED } //TESTED (meta_stream_test test1,test2) } //(end try) catch (Exception e) { // various parsing errors _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } } //TESTED (meta_stream_test) // (don't currently support other script types) }
From source file:net.xy.jcms.controller.configurations.parser.TranslationParser.java
/** * parses an xml configuration from an input streams. throwes * IllegalArgumentExceptions in case of syntax error. * /*from w w w . j a v a 2s. co m*/ * @param in * @return value * @throws XMLStreamException * @throws ClassNotFoundException * in case there are problems with an params type converter */ public static TranslationRule[] parse(final InputStream in, final ClassLoader loader) throws XMLStreamException, ClassNotFoundException { @SuppressWarnings("deprecation") final XMLInputFactory factory = XMLInputFactory.newInstance( "com.sun.xml.internal.stream.XMLInputFactoryImpl", TranslationParser.class.getClassLoader()); LOG.info("XMLInputFactory loaded: " + factory.getClass().getName()); factory.setProperty("javax.xml.stream.isCoalescing", true); // not supported be the reference implementation // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE); final XMLStreamReader parser = factory.createXMLStreamReader(in); while (parser.hasNext()) { final int event = parser.next(); if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("rules")) { return parseRules(parser, loader); } } throw new IllegalArgumentException("No rules section found."); }
From source file:net.xy.jcms.controller.configurations.parser.TranslationParser.java
/**
 * Parses a single-file translation: hands the first {@code <rule>} element found in the
 * stream to {@code parseRule}.
 *
 * @param in
 *            stream supplying the XML of one translation rule
 * @param loader
 *            class loader passed through to {@code parseRule}
 * @return the rule returned by {@code parseRule}
 * @throws XMLStreamException
 *             if the XML cannot be read
 * @throws ClassNotFoundException
 *             in case there are problems with a param's type converter
 * @throws IllegalArgumentException
 *             if the document contains no {@code <rule>} start element
 */
public static TranslationRule parseSingle(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    final XMLInputFactory factory = XMLInputFactory.newInstance(
            "com.sun.xml.internal.stream.XMLInputFactoryImpl", TranslationParser.class.getClassLoader());
    LOG.info("XMLInputFactory loaded: " + factory.getClass().getName());
    factory.setProperty(XMLInputFactory.IS_COALESCING, true);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    try {
        while (parser.hasNext()) {
            final int event = parser.next();
            if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("rule")) {
                return parseRule(parser, loader);
            }
        }
        // message now names the element actually searched for (was "rules")
        throw new IllegalArgumentException("No rule section found.");
    } finally {
        // Release the reader's resources on every exit path (previously it was never closed).
        try {
            parser.close();
        } catch (final XMLStreamException ignored) {
            // a failed close must not replace the parse result or the original error
        }
    }
}
From source file:net.xy.jcms.controller.configurations.parser.UsecaseParser.java
/** * parses usecases out from an xml file/*from w w w . j a va 2 s. c o m*/ * * @param in * @param loader * used for retrieving configuration included resources and also * for retrieving the controllers * @return value * @throws XMLStreamException * @throws ClassNotFoundException */ public static Usecase[] parse(final InputStream in, final ClassLoader loader) throws XMLStreamException, ClassNotFoundException { final XMLInputFactory factory = XMLInputFactory.newInstance(); factory.setProperty("javax.xml.stream.isCoalescing", true); // not supported by the reference implementation // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE); final XMLStreamReader parser = factory.createXMLStreamReader(in); while (parser.hasNext()) { final int event = parser.next(); if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("usecases")) { return parseUsecases(parser, loader); } } throw new IllegalArgumentException("No usecases section found. [" + parser.getLocation() + "]"); }
From source file:net.xy.jcms.controller.configurations.parser.UsecaseParser.java
/** * method for parsing single usecase xml files. one per file. * // w ww .ja v a2s . c o m * @param in * @param loader * @return parsed usecase * @throws XMLStreamException * @throws ClassNotFoundException */ public static Usecase parseSingle(final InputStream in, final ClassLoader loader) throws XMLStreamException, ClassNotFoundException { final XMLInputFactory factory = XMLInputFactory.newInstance(); factory.setProperty("javax.xml.stream.isCoalescing", true); // not supported by the reference implementation // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE); final XMLStreamReader parser = factory.createXMLStreamReader(in); while (parser.hasNext()) { final int event = parser.next(); if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("usecase")) { return parseUsecase(parser, loader); } } throw new IllegalArgumentException("No usecases section found. [" + parser.getLocation() + "]"); }
From source file:org.activiti.bpmn.converter.BpmnXMLConverter.java
public BpmnModel convertToBpmnModel(InputStreamProvider inputStreamProvider, boolean validateSchema, boolean enableSafeBpmnXml, String encoding) { XMLInputFactory xif = XMLInputFactory.newInstance(); if (xif.isPropertySupported(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES)) { xif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, false); }/*from www . j ava2 s.com*/ if (xif.isPropertySupported(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES)) { xif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); } if (xif.isPropertySupported(XMLInputFactory.SUPPORT_DTD)) { xif.setProperty(XMLInputFactory.SUPPORT_DTD, false); } InputStreamReader in = null; try { in = new InputStreamReader(inputStreamProvider.getInputStream(), encoding); XMLStreamReader xtr = xif.createXMLStreamReader(in); try { if (validateSchema) { if (!enableSafeBpmnXml) { validateModel(inputStreamProvider); } else { validateModel(xtr); } // The input stream is closed after schema validation in = new InputStreamReader(inputStreamProvider.getInputStream(), encoding); xtr = xif.createXMLStreamReader(in); } } catch (Exception e) { throw new RuntimeException("Could not validate XML with BPMN 2.0 XSD", e); } // XML conversion return convertToBpmnModel(xtr); } catch (UnsupportedEncodingException e) { throw new RuntimeException("The bpmn 2.0 xml is not UTF8 encoded", e); } catch (XMLStreamException e) { throw new RuntimeException("Error while reading the BPMN 2.0 XML", e); } finally { if (in != null) { try { in.close(); } catch (IOException e) { LOGGER.debug("Problem closing BPMN input stream", e); } } } }