List of usage examples for java.lang NoSuchFieldError getMessage
public String getMessage()
From source file: uk.bl.wa.analyser.payload.TikaPayloadAnalyser.java
/** * @param source the source of the input stream - typically a WARC file. Used for error logging. * @param solr /*from w ww . j a v a 2 s .c o m*/ * @param is content to analyse. * @param url optional URL for the bytes in is. * @return * @throws IOException */ @SuppressWarnings("deprecation") public SolrRecord extract(String source, SolrRecord solr, InputStream is, String url) throws IOException { // Set up the TikaInputStream: TikaInputStream tikainput = null; if (this.maxBytesToParser > 0) { tikainput = TikaInputStream .get(new BoundedInputStream(new CloseShieldInputStream(is), maxBytesToParser)); } else { tikainput = TikaInputStream.get(new CloseShieldInputStream(is)); } // Also pass URL as metadata to allow extension hints to work: Metadata metadata = new Metadata(); if (url != null) metadata.set(Metadata.RESOURCE_NAME_KEY, url); final long detectStart = System.nanoTime(); StringBuilder detected = new StringBuilder(); try { DetectRunner detect = new DetectRunner(source, tika, tikainput, detected, metadata); TimeLimiter.run(detect, 10000L, false); } catch (NoSuchFieldError e) { // TODO Is this an Apache POI version issue? 
log.error("Tika.detect(): " + e.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, new Exception("detect threw " + e.getClass().getCanonicalName())); } catch (Exception e) { log.error("Tika.detect(): " + e.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, e); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#detect", detectStart); // Only proceed if we have a suitable type: if (!this.checkMime(detected.toString())) { if ("".equals(detected.toString())) { solr.addField(SolrFields.SOLR_CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM.toString()); } else { solr.addField(SolrFields.SOLR_CONTENT_TYPE, detected.toString()); } return solr; } // Context ParseContext context = new ParseContext(); StringWriter content = new StringWriter(); // Override the recursive parsing: if (embedded == null) embedded = new NonRecursiveEmbeddedDocumentExtractor(context); context.set(EmbeddedDocumentExtractor.class, embedded); try { final long parseStart = System.nanoTime(); ParseRunner runner = new ParseRunner(source, tika.getParser(), tikainput, this.getHandler(content), metadata, context); try { TimeLimiter.run(runner, parseTimeout, true); } catch (OutOfMemoryError o) { log.error("TikaExtractor.parse() - OutOfMemoryError: " + o.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, new Exception("OutOfMemoryError")); } catch (RuntimeException r) { log.error("TikaExtractor.parse() - RuntimeException: " + r.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, r); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#parse", parseStart); // If there was a parse error, report it: String tikaException = metadata.get(TikaPayloadAnalyser.TIKA_PARSE_EXCEPTION); if (tikaException != null) { solr.addParseException(tikaException, new RuntimeException("Exception from Tika")); } final long extractStart = 
System.nanoTime(); // Copy the body text, forcing a UTF-8 encoding: String output = new String(content.toString().getBytes("UTF-8")); if (runner.complete || !output.equals("")) { if (output.length() > this.max_text_length) { output = output.substring(0, this.max_text_length); } log.debug("Extracted text from: " + url + " in " + source); log.debug("Extracted text: " + StringUtils.left(output, 300)); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT, output); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_LENGTH, Integer.toString(output.length())); } else { //log.debug("Failed to extract any text from: "+url); } // Noisily report all metadata properties: /* * for( String m : metadata.names() ) { * log.info("For "+url.substring(url.length() - (int) * Math.pow(url.length(),0.85))+": "+m+" -> "+metadata.get(m)); } */ // Attempt to record all metadata discovered: if (this.extractAllMetadata) { for (String m : metadata.names()) { // Ignore these as they are not very interesting: if (Metadata.RESOURCE_NAME_KEY.equalsIgnoreCase(m) || "dc:title".equalsIgnoreCase(m) || "title".equalsIgnoreCase(m) || "description".equalsIgnoreCase(m) || "keywords".equalsIgnoreCase(m) || Metadata.CONTENT_ENCODING.equalsIgnoreCase(m) || Metadata.CONTENT_LOCATION.equalsIgnoreCase(m) || "ACTINICTITLE".equalsIgnoreCase(m) || Metadata.CONTENT_TYPE.equalsIgnoreCase(m)) { continue; } // Record in the document, but trim big ones: String value = metadata.get(m); if (value != null && value.length() > 100) { value = value.substring(0, 100); } solr.addField(SolrFields.SOLR_TIKA_METADATA, m + "=" + value); } } // Also Pick out particular metadata: String contentType = metadata.get(Metadata.CONTENT_TYPE); solr.addField(SolrFields.SOLR_CONTENT_TYPE, contentType); solr.addField(SolrFields.SOLR_TITLE, metadata.get(DublinCore.TITLE)); solr.addField(SolrFields.SOLR_DESCRIPTION, metadata.get(DublinCore.DESCRIPTION)); solr.addField(SolrFields.SOLR_KEYWORDS, metadata.get("keywords")); solr.addField(SolrFields.SOLR_AUTHOR, 
metadata.get(DublinCore.CREATOR)); solr.addField(SolrFields.CONTENT_ENCODING, metadata.get(Metadata.CONTENT_ENCODING)); // Parse out any embedded date that can act as a created/modified date. // I was not able find a single example where both created and modified where defined and different. I single field is sufficient. String date = null; if (metadata.get(Metadata.CREATION_DATE) != null) date = metadata.get(Metadata.CREATION_DATE); if (metadata.get(Metadata.DATE) != null) date = metadata.get(Metadata.DATE); if (metadata.get(Metadata.MODIFIED) != null) date = metadata.get(Metadata.MODIFIED); if (date != null) { DateTimeFormatter df = ISODateTimeFormat.dateTimeParser(); DateTime edate = null; try { edate = df.parseDateTime(date); } catch (IllegalArgumentException e) { log.error("Could not parse date: " + date + " from URL " + url + " in " + source); } if (edate == null) { Date javadate = Times.extractDate(date); if (javadate != null) edate = new org.joda.time.DateTime(javadate); } if (edate != null) { solr.addField(SolrFields.LAST_MODIFIED_YEAR, "" + edate.getYear()); DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis().withZone(DateTimeZone.UTC); // solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED, // edate); solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate)); } } // Also look to record the software identifiers: // Look for generic xmp:CreatorTool solr.addField(SolrFields.GENERATOR, metadata.get("xmp:CreatorTool")); // For PDF, support other metadata tags: //solr.addField(SolrFields.GENERATOR, metadata.get( "creator" )); // This appears to be dc:creator i.e. author. 
solr.addField(SolrFields.GENERATOR, metadata.get("producer")); solr.addField(SolrFields.GENERATOR, metadata.get(Metadata.SOFTWARE)); solr.addField(SolrFields.GENERATOR, metadata.get("software")); solr.addField(SolrFields.GENERATOR, metadata.get("Software")); solr.addField(SolrFields.GENERATOR, metadata.get("generator")); solr.addField(SolrFields.GENERATOR, metadata.get("Generator")); solr.addField(SolrFields.GENERATOR, metadata.get("ProgId")); //handle image EXIF metaformat String exifVersion = metadata.get("Exif Version"); if (exifVersion != null) { solr.addField(SolrFields.EXIF_VERSION, exifVersion); String exif_artist = metadata.get("Artist"); if (exif_artist != null) { // This is a better value for the author field // This potentially results in multiple author, which is valid solr.addField(SolrFields.SOLR_AUTHOR, exif_artist); } if (this.extractExifLocation) { String exif_latitude = metadata.get("GPS Latitude"); String exif_longitude = metadata.get("GPS Longitude"); if (exif_latitude != null && exif_longitude != null) { double latitude = DMS2DG(exif_latitude); double longitude = DMS2DG(exif_longitude); try { if (latitude != 0d && longitude != 0d) { // Sometimes they are defined but both 0 if (latitude <= 90 && latitude >= -90 && longitude <= 180 && longitude >= -180) { solr.addField(SolrFields.EXIF_LOCATION, latitude + "," + longitude); } else { log.warn( "invalid gsp exif information:" + exif_latitude + "," + exif_longitude); } } } catch (Exception e) { //Just ignore. No GPS data added to solr log.warn("error parsing exif gps data. latitude:" + exif_latitude + " longitude:" + exif_longitude); } } } } //End image exif metadata // Application ID, MS Office only AFAICT, and the VERSION is only doc String software = null; if (metadata.get(Metadata.APPLICATION_NAME) != null) software = metadata.get(Metadata.APPLICATION_NAME); if (metadata.get(Metadata.APPLICATION_VERSION) != null) software += " " + metadata.get(Metadata.APPLICATION_VERSION); // Images, e.g. 
JPEG and TIFF, can have 'Software', 'tiff:Software', // PNGs have a 'tEXt tEXtEntry: keyword=Software, value=GPL Ghostscript 8.71' String png_textentry = metadata.get("tEXt tEXtEntry"); if (png_textentry != null && png_textentry.contains("keyword=Software, value=")) software = png_textentry.replace("keyword=Software, value=", ""); /* Some JPEGs have this: Jpeg Comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality */ if (software != null) { solr.addField(SolrFields.GENERATOR, software); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#extract", extractStart); } catch (Exception e) { log.error("TikaExtractor.extract(): " + e.getMessage() + " for URL " + url + " in " + source); } // TODO: This should probably be wrapped in a method-spanning try-finally to guarantee close if (tikainput != null) { try { tikainput.close(); } catch (IOException e) { log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage() + " for " + url + " in " + source); } } return solr; }
From source file: uk.bl.wa.solr.TikaExtractor.java
/** * /*from ww w. ja v a 2 s . co m*/ * @param solr * @param is * @param url * @return * @throws IOException */ @SuppressWarnings("deprecation") public SolrRecord extract(SolrRecord solr, InputStream is, String url) throws IOException { // Set up the TikaInputStream: TikaInputStream tikainput = null; if (this.maxBytesToParser > 0) { tikainput = TikaInputStream .get(new BoundedInputStream(new CloseShieldInputStream(is), maxBytesToParser)); } else { tikainput = TikaInputStream.get(new CloseShieldInputStream(is)); } // Also pass URL as metadata to allow extension hints to work: Metadata metadata = new Metadata(); if (url != null) metadata.set(Metadata.RESOURCE_NAME_KEY, url); final long detectStart = System.nanoTime(); StringBuilder detected = new StringBuilder(); try { DetectRunner detect = new DetectRunner(tika, tikainput, detected, metadata); Thread detectThread = new Thread(detect, Long.toString(System.currentTimeMillis())); detectThread.start(); detectThread.join(10000L); detectThread.interrupt(); } catch (NoSuchFieldError e) { // TODO Is this an Apache POI version issue? 
log.error("Tika.detect(): " + e.getMessage()); addExceptionMetadata(metadata, new Exception("detect threw " + e.getClass().getCanonicalName())); } catch (Exception e) { log.error("Tika.detect(): " + e.getMessage()); addExceptionMetadata(metadata, e); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#detect", detectStart); // Only proceed if we have a suitable type: if (!this.checkMime(detected.toString())) { if ("".equals(detected.toString())) { solr.addField(SolrFields.SOLR_CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM.toString()); } else { solr.addField(SolrFields.SOLR_CONTENT_TYPE, detected.toString()); } return solr; } // Context ParseContext context = new ParseContext(); StringWriter content = new StringWriter(); // Override the recursive parsing: if (embedded == null) embedded = new NonRecursiveEmbeddedDocumentExtractor(context); context.set(EmbeddedDocumentExtractor.class, embedded); try { final long parseStart = System.nanoTime(); ParseRunner runner = new ParseRunner(tika.getParser(), tikainput, this.getHandler(content), metadata, context); Thread parseThread = new Thread(runner, Long.toString(System.currentTimeMillis())); try { parseThread.start(); parseThread.join(this.parseTimeout); parseThread.interrupt(); parseThread.join(this.parseTimeout); } catch (OutOfMemoryError o) { log.error("TikaExtractor.parse() - OutOfMemoryError: " + o.getMessage()); addExceptionMetadata(metadata, new Exception("OutOfMemoryError")); } catch (RuntimeException r) { log.error("TikaExtractor.parse() - RuntimeException: " + r.getMessage()); addExceptionMetadata(metadata, r); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#parse", parseStart); // If there was a parse error, report it: solr.addField(SolrFields.PARSE_ERROR, metadata.get(TikaExtractor.TIKA_PARSE_EXCEPTION)); final long extractStart = System.nanoTime(); // Copy the body text, forcing a UTF-8 encoding: String output = new 
String(content.toString().getBytes("UTF-8")); if (runner.complete || !output.equals("")) { if (output.length() > this.max_text_length) { output = output.substring(0, this.max_text_length); } log.debug("Extracted text from: " + url); log.debug("Extracted text: " + StringUtils.left(output, 300)); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT, output); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_LENGTH, Integer.toString(output.length())); } else { //log.debug("Failed to extract any text from: "+url); } // Noisily report all metadata properties: /* * for( String m : metadata.names() ) { * log.info("For "+url.substring(url.length() - (int) * Math.pow(url.length(),0.85))+": "+m+" -> "+metadata.get(m)); } */ // Attempt to record all metadata discovered: if (this.extractAllMetadata) { for (String m : metadata.names()) { // Ignore these as they are not very interesting: if (Metadata.RESOURCE_NAME_KEY.equalsIgnoreCase(m) || "dc:title".equalsIgnoreCase(m) || "title".equalsIgnoreCase(m) || "description".equalsIgnoreCase(m) || "keywords".equalsIgnoreCase(m) || Metadata.CONTENT_ENCODING.equalsIgnoreCase(m) || Metadata.CONTENT_LOCATION.equalsIgnoreCase(m) || "ACTINICTITLE".equalsIgnoreCase(m) || Metadata.CONTENT_TYPE.equalsIgnoreCase(m)) { continue; } // Record in the document, but trim big ones: String value = metadata.get(m); if (value != null && value.length() > 100) { value = value.substring(0, 100); } solr.addField(SolrFields.SOLR_TIKA_METADATA, m + "=" + value); } } // Also Pick out particular metadata: String contentType = metadata.get(Metadata.CONTENT_TYPE); solr.addField(SolrFields.SOLR_CONTENT_TYPE, contentType); solr.addField(SolrFields.SOLR_TITLE, metadata.get(DublinCore.TITLE)); solr.addField(SolrFields.SOLR_DESCRIPTION, metadata.get(DublinCore.DESCRIPTION)); solr.addField(SolrFields.SOLR_KEYWORDS, metadata.get("keywords")); solr.addField(SolrFields.SOLR_AUTHOR, metadata.get(DublinCore.CREATOR)); solr.addField(SolrFields.CONTENT_ENCODING, 
metadata.get(Metadata.CONTENT_ENCODING)); // Parse out any embedded date that can act as a created/modified date. String date = null; if (metadata.get(Metadata.CREATION_DATE) != null) date = metadata.get(Metadata.CREATION_DATE); if (metadata.get(Metadata.DATE) != null) date = metadata.get(Metadata.DATE); if (metadata.get(Metadata.MODIFIED) != null) date = metadata.get(Metadata.MODIFIED); if (date != null) { DateTimeFormatter df = ISODateTimeFormat.dateTimeParser(); DateTime edate = null; try { edate = df.parseDateTime(date); } catch (IllegalArgumentException e) { log.error("Could not parse: " + date); } if (edate == null) { Date javadate = Times.extractDate(date); if (javadate != null) edate = new org.joda.time.DateTime(javadate); } if (edate != null) { solr.addField(SolrFields.LAST_MODIFIED_YEAR, "" + edate.getYear()); DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis().withZone(DateTimeZone.UTC); // solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED, // edate); solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate)); } } // Also look to record the software identifiers: // Look for generic xmp:CreatorTool solr.addField(SolrFields.GENERATOR, metadata.get("xmp:CreatorTool")); // For PDF, support other metadata tags: //solr.addField(SolrFields.GENERATOR, metadata.get( "creator" )); // This appears to be dc:creator i.e. author. solr.addField(SolrFields.GENERATOR, metadata.get("producer")); solr.addField(SolrFields.GENERATOR, metadata.get(Metadata.SOFTWARE)); solr.addField(SolrFields.GENERATOR, metadata.get("generator")); solr.addField(SolrFields.GENERATOR, metadata.get("Software")); // Application ID, MS Office only AFAICT, and the VERSION is only doc String software = null; if (metadata.get(Metadata.APPLICATION_NAME) != null) software = metadata.get(Metadata.APPLICATION_NAME); if (metadata.get(Metadata.APPLICATION_VERSION) != null) software += " " + metadata.get(Metadata.APPLICATION_VERSION); // Images, e.g. 
JPEG and TIFF, can have 'Software', 'tiff:Software', // PNGs have a 'tEXt tEXtEntry: keyword=Software, value=GPL Ghostscript 8.71' String png_textentry = metadata.get("tEXt tEXtEntry"); if (png_textentry != null && png_textentry.contains("keyword=Software, value=")) software = png_textentry.replace("keyword=Software, value=", ""); /* Some JPEGs have this: Jpeg Comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality */ if (software != null) { solr.addField(SolrFields.GENERATOR, software); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#extract", extractStart); } catch (Exception e) { log.error("TikaExtractor.extract(): " + e.getMessage()); } // TODO: This should probably be wrapped in a method-spanning try-finally to guarantee close if (tikainput != null) { try { tikainput.close(); } catch (IOException e) { log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage()); } } return solr; }