List of usage examples for the java.io.InputStream.mark(int) method
public synchronized void mark(int readlimit)
From source file: org.sakaiproject.tool.assessment.facade.AssessmentGradingFacadeQueries.java
private byte[] getMediaStream(Long mediaId) { byte[] b = new byte[4000]; Session session = null;//from w ww .j av a 2 s .com Connection conn = null; InputStream in = null; ResultSet rs = null; PreparedStatement statement = null; try { session = getSessionFactory().openSession(); conn = session.connection(); log.debug("****Connection=" + conn); String query = "select MEDIA from SAM_MEDIA_T where MEDIAID=?"; statement = conn.prepareStatement(query); statement.setLong(1, mediaId.longValue()); rs = statement.executeQuery(); if (rs.next()) { java.lang.Object o = rs.getObject("MEDIA"); if (o != null) { in = rs.getBinaryStream("MEDIA"); in.mark(0); int ch; int len = 0; while ((ch = in.read()) != -1) { len++; } b = new byte[len]; in.reset(); in.read(b, 0, len); } } } catch (Exception e) { log.warn(e.getMessage()); } finally { if (session != null) { try { session.close(); } catch (Exception e1) { e1.printStackTrace(); } } if (rs != null) { try { rs.close(); } catch (Exception e1) { e1.printStackTrace(); } } if (statement != null) { try { statement.close(); } catch (Exception e1) { e1.printStackTrace(); } } if (in != null) { try { in.close(); } catch (Exception e1) { e1.printStackTrace(); } } if (conn != null) { try { conn.close(); } catch (Exception e1) { e1.printStackTrace(); } } } return b; }
From source file: Main.java
/**
 * Sniffs the character encoding of an XML stream by inspecting its first
 * bytes (byte-order marks and the byte pattern of a leading {@code "<?xm"}).
 *
 * <p>NOTE(review): this method consumes between 1 and 4 bytes from
 * {@code stream} (the exact count depends on which branch matches) and does
 * NOT rewind, except that the UTF-8 BOM branch calls {@code mark(1024)} to
 * pin the position just past the BOM (i.e. the BOM is deliberately
 * stripped). Callers presumably re-mark/reset around this call — TODO
 * confirm against call sites.
 *
 * @param stream the input to sniff; must be positioned at byte 0
 * @return one of the encoding constants ({@code UCS_4BE}, {@code UCS_4LE},
 *         {@code UTF_16BE}, {@code UTF_16LE}, {@code UTF_8},
 *         {@code EBCDIC}, {@code UNKNOWN}) or {@code null} if no
 *         signature matched
 * @throws IOException if reading from the stream fails
 */
public static String getStreamEncoding(InputStream stream) throws IOException {
    String encoding = null;
    boolean DEBUG = false;
    if (DEBUG) {
        // Debug aid only: dump every charset the JVM supports.
        SortedMap map = Charset.availableCharsets();
        Object[] keys = map.keySet().toArray();
        for (int i = 0; i < keys.length; i++) {
            System.out.println("Key = " + keys[i] + " Value = " + map.get(keys[i]));
        }
    }
    int ch = stream.read();
    if (DEBUG)
        System.out.print("[" + ch + "]");
    if (ch == 0x00) {
        // First byte 0x00: candidates are UCS-4 big-endian variants or
        // big-endian UTF-16 without BOM ("<?" as 00 3C 00 3F).
        ch = stream.read();
        if (DEBUG)
            System.out.print("[" + ch + "]");
        if (ch == 0x00) {
            ch = stream.read();
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0xFE) {
                // 00 00 FE FF: UCS-4 big-endian (1234 order) BOM.
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0xFF) {
                    encoding = UCS_4BE;
                }
            } else if (ch == 0xFF) {
                // 00 00 FF FE: unusual UCS-4 byte order (2143) — not supported.
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0xFE) {
                    encoding = UNKNOWN;
                }
            } else if (ch == 0x00) {
                // 00 00 00 3C: '<' in UCS-4 big-endian without a BOM.
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x3C) {
                    encoding = UCS_4BE;
                }
            } else if (ch == 0x3C) {
                // 00 00 3C 00: '<' in UCS-4 unusual order (2143) — not supported.
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x00) {
                    encoding = UNKNOWN;
                }
            }
        } else if (ch == 0x3C) {
            ch = stream.read();
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0x00) {
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x00) {
                    // 00 3C 00 00: '<' in UCS-4 unusual order (3412) — not supported.
                    encoding = UNKNOWN;
                } else if (ch == 0x3F) {
                    // 00 3C 00 3F: "<?" in UTF-16 big-endian without a BOM.
                    encoding = UTF_16BE;
                }
            }
        }
    } else if (ch == 0x3C) {
        // First byte '<': little-endian UTF-16/UCS-4 without BOM, or plain UTF-8.
        ch = stream.read();
        if (DEBUG)
            System.out.print("[" + ch + "]");
        if (ch == 0x00) {
            ch = stream.read();
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0x00) {
                // 3C 00 00 00: '<' in UCS-4 little-endian.
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x00) {
                    encoding = UCS_4LE;
                }
            } else if (ch == 0x3F) {
                // 3C 00 3F 00: "<?" in UTF-16 little-endian.
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x00) {
                    encoding = UTF_16LE;
                }
            }
        } else if (ch == 0x3F) {
            // 3C 3F 78 6D: ASCII "<?xm" — treat as UTF-8 (or any ASCII superset).
            ch = stream.read();
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0x78) {
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x6D) {
                    encoding = UTF_8;
                }
            }
        }
    } else if (ch == 0xFF) {
        // FF FE: UTF-16 little-endian BOM; FF FE 00 00: UCS-4 little-endian BOM.
        ch = stream.read();
        if (DEBUG)
            System.out.print("[" + ch + "]");
        if (ch == 0xFE) {
            ch = stream.read();
            encoding = UTF_16LE;
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0x00) {
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x00) {
                    // Overrides the UTF-16LE guess made above.
                    encoding = UCS_4LE;
                }
            }
        }
    } else if (ch == 0xFE) {
        // FE FF: UTF-16 big-endian BOM; FE FF 00 00: unusual UCS-4 (3412) — unsupported.
        ch = stream.read();
        if (DEBUG)
            System.out.print("[" + ch + "]");
        if (ch == 0xFF) {
            ch = stream.read();
            encoding = UTF_16BE;
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0x00) {
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x00) {
                    // Overrides the UTF-16BE guess made above.
                    encoding = UNKNOWN;
                }
            }
        }
    } else if (ch == 0xEF) {
        // EF BB BF: UTF-8 BOM.
        ch = stream.read();
        if (DEBUG)
            System.out.print("[" + ch + "]");
        if (ch == 0xBB) {
            ch = stream.read();
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0xBF) {
                // Mark just past the BOM so a later reset() lands after it
                // (i.e. the byte order mark is stripped from the content).
                stream.mark(1024);
                encoding = UTF_8;
            }
        }
    } else if (ch == 0x4C) {
        // 4C 6F A7 94: "<?xm" in EBCDIC.
        ch = stream.read();
        if (DEBUG)
            System.out.print("[" + ch + "]");
        if (ch == 0x6F) {
            ch = stream.read();
            if (DEBUG)
                System.out.print("[" + ch + "]");
            if (ch == 0xA7) {
                ch = stream.read();
                if (DEBUG)
                    System.out.print("[" + ch + "]");
                if (ch == 0x94) {
                    encoding = EBCDIC;
                }
            }
        }
    }
    if (DEBUG)
        System.out.println("getStreamEncoding() [" + encoding + "]");
    return encoding;
}
From source file: uk.bl.wa.indexer.WARCIndexer.java
/** * This extracts metadata from the ArchiveRecord and creates a suitable SolrRecord. * Removes the text field if flag set./* ww w .j ava 2 s.co m*/ * * @param archiveName * @param record * @param isTextIncluded * @return * @throws IOException */ public SolrRecord extract(String archiveName, ArchiveRecord record, boolean isTextIncluded) throws IOException { final long start = System.nanoTime(); ArchiveRecordHeader header = record.getHeader(); SolrRecord solr = solrFactory.createRecord(archiveName, header); if (!header.getHeaderFields().isEmpty()) { if (header.getHeaderFieldKeys().contains(HEADER_KEY_TYPE)) { log.debug("Looking at " + header.getHeaderValue(HEADER_KEY_TYPE)); if (!checkRecordType((String) header.getHeaderValue(HEADER_KEY_TYPE))) { return null; } // Store WARC record type: solr.setField(SolrFields.SOLR_RECORD_TYPE, (String) header.getHeaderValue(HEADER_KEY_TYPE)); //Store WARC-Record-ID solr.setField(SolrFields.WARC_KEY_ID, (String) header.getHeaderValue(HEADER_KEY_ID)); solr.setField(SolrFields.WARC_IP, (String) header.getHeaderValue(HEADER_KEY_IP)); } else { // else we're processing ARCs so nothing to filter and no // revisits solr.setField(SolrFields.SOLR_RECORD_TYPE, "arc"); } if (header.getUrl() == null) return null; // Get the URL: String targetUrl = Normalisation.sanitiseWARCHeaderValue(header.getUrl()); // Strip down very long URLs to avoid // "org.apache.commons.httpclient.URIException: Created (escaped) // uuri > 2083" // Trac #2271: replace string-splitting with URI-based methods. 
if (targetUrl.length() > 2000) targetUrl = targetUrl.substring(0, 2000); log.debug( "Current heap usage: " + FileUtils.byteCountToDisplaySize(Runtime.getRuntime().totalMemory())); log.debug("Processing " + targetUrl + " from " + archiveName); // Check the filters: if (this.checkProtocol(targetUrl) == false) return null; if (this.checkUrl(targetUrl) == false) return null; if (this.checkExclusionFilter(targetUrl) == false) return null; // ----------------------------------------------------- // Add user supplied Archive-It Solr fields and values: // ----------------------------------------------------- solr.setField(SolrFields.INSTITUTION, WARCIndexerCommand.institution); solr.setField(SolrFields.COLLECTION, WARCIndexerCommand.collection); solr.setField(SolrFields.COLLECTION_ID, WARCIndexerCommand.collection_id); // --- Basic headers --- // Basic metadata: solr.setField(SolrFields.SOURCE_FILE, archiveName); solr.setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); String filePath = header.getReaderIdentifier();//Full path of file //Will convert windows path to linux path. Linux paths will not be modified. String linuxFilePath = FilenameUtils.separatorsToUnix(filePath); solr.setField(SolrFields.SOURCE_FILE_PATH, linuxFilePath); byte[] url_md5digest = md5 .digest(Normalisation.sanitiseWARCHeaderValue(header.getUrl()).getBytes("UTF-8")); // String url_base64 = // Base64.encodeBase64String(fullUrl.getBytes("UTF-8")); String url_md5hex = Base64.encodeBase64String(url_md5digest); solr.setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); if (addNormalisedURL) { solr.setField(SolrFields.SOLR_URL_NORMALISED, Normalisation.canonicaliseURL(targetUrl)); } // Get the length, but beware, this value also includes the HTTP headers (i.e. 
it is the payload_length): long content_length = header.getLength(); // Also pull out the file extension, if any: String resourceName = parseResourceName(targetUrl); solr.addField(SolrFields.RESOURCE_NAME, resourceName); solr.addField(SolrFields.CONTENT_TYPE_EXT, parseExtension(resourceName)); // Add URL-based fields: URI saneURI = parseURL(solr, targetUrl); // Prepare crawl date information: String waybackDate = (header.getDate().replaceAll("[^0-9]", "")); Date crawlDate = getWaybackDate(waybackDate); // Store the dates: solr.setField(SolrFields.CRAWL_DATE, formatter.format(crawlDate)); solr.setField(SolrFields.CRAWL_YEAR, getYearFromDate(crawlDate)); // Use the current value as the waybackDate: solr.setField(SolrFields.WAYBACK_DATE, waybackDate); Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#archeaders", start); // ----------------------------------------------------- // Now consume record and HTTP headers (only) // ----------------------------------------------------- InputStream tikainput = null; // Only parse HTTP headers for HTTP URIs if (targetUrl.startsWith("http")) { // Parse HTTP headers: String statusCode = null; if (record instanceof WARCRecord) { statusCode = this.processWARCHeaders(record, header, targetUrl, solr); tikainput = record; } else if (record instanceof ARCRecord) { ARCRecord arcr = (ARCRecord) record; statusCode = "" + arcr.getStatusCode(); this.processHeaders(solr, statusCode, arcr.getHttpHeaders(), targetUrl); arcr.skipHttpHeader(); tikainput = arcr; } else { log.error("FAIL! Unsupported archive record type."); return solr; } solr.setField(SolrFields.SOLR_STATUS_CODE, statusCode); // Skip recording non-content URLs (i.e. 
2xx responses only please): if (!checkResponseCode(statusCode)) { log.debug("Skipping this record based on status code " + statusCode + ": " + targetUrl); return null; } } else { log.info("Skipping header parsing as URL does not start with 'http'"); } // ----------------------------------------------------- // Headers have been processed, payload ready to cache: // ----------------------------------------------------- // Update the content_length based on what's available: content_length = tikainput.available(); // Record the length: solr.setField(SolrFields.CONTENT_LENGTH, "" + content_length); // Create an appropriately cached version of the payload, to allow analysis. final long hashStreamStart = System.nanoTime(); HashedCachedInputStream hcis = new HashedCachedInputStream(header, tikainput, content_length); tikainput = hcis.getInputStream(); String hash = hcis.getHash(); Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#hashstreamwrap", hashStreamStart); // Use an ID that ensures every URL+timestamp gets a separate // record: String id = waybackDate + "/" + url_md5hex; // Set these last: solr.setField(SolrFields.ID, id); solr.setField(SolrFields.HASH, hash); // ----------------------------------------------------- // Apply any annotations: // ----------------------------------------------------- if (ant != null) { try { ant.applyAnnotations(saneURI, solr.getSolrDocument()); } catch (URISyntaxException e) { e.printStackTrace(); log.error("Failed to annotate " + saneURI + " : " + e); } } // ----------------------------------------------------- // WARC revisit record handling: // ----------------------------------------------------- // If this is a revisit record, we should just return an update to the crawl_dates (when using hashUrlId) if (WARCConstants.WARCRecordType.revisit.name() .equalsIgnoreCase((String) header.getHeaderValue(HEADER_KEY_TYPE))) { solr.removeField(SolrFields.CONTENT_LENGTH); //It is 0 and would mess with statistics //Copy 
content_type_served to content_type (no tika/droid for revisits) solr.addField(SolrFields.SOLR_CONTENT_TYPE, (String) solr.getFieldValue(SolrFields.CONTENT_TYPE_SERVED)); return solr; } // ----------------------------------------------------- // Payload duplication has been checked, ready to parse: // ----------------------------------------------------- final long analyzeStart = System.nanoTime(); // Mark the start of the payload, with a readLimit corresponding to // the payload size: tikainput.mark((int) content_length); // Pass on to other extractors as required, resetting the stream before each: this.wpa.analyse(archiveName, header, tikainput, solr, content_length); Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#analyzetikainput", analyzeStart); // Clear up the caching of the payload: hcis.cleanup(); // ----------------------------------------------------- // Payload analysis complete, now performing text analysis: // ----------------------------------------------------- this.txa.analyse(solr); // Remove the Text Field if required if (!isTextIncluded) { solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT); } else { // Otherwise, decide whether to store or both store and index // the text: if (storeText == false) { // Copy the text into the indexed (but not stored) field: solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_NOT_STORED, (String) solr.getField(SolrFields.SOLR_EXTRACTED_TEXT).getFirstValue()); // Take the text out of the original (stored) field. solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT); } } } Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation", "WARCIndexer.extract#total", start); String servedType = "" + solr.getField(SolrFields.CONTENT_TYPE_SERVED); Instrument.timeRel("WARCIndexer#content_types", "WARCIndexer#" + (servedType.contains(";") ? servedType.split(";")[0] : servedType), start); Instrument.timeRel("WARCIndexer#content_types", start); return solr; }
From source file: org.regenstrief.util.Util.java
/** * Retrieves an InputStream that can be marked and marks it * /*from w w w . ja va2s . c o m*/ * @param in the InputStream * @return the markable InputStream **/ public static final InputStream getMarkableInputStream(InputStream in) { in = in.markSupported() ? in : new BufferedInputStream(in); in.mark(MAX_MARK_BUFFER); return in; }
From source file: org.regenstrief.util.Util.java
/**
 * Returns a mark-capable view of the given stream and marks its current
 * position with the supplied read-ahead limit.
 *
 * <p>If the stream already supports mark/reset it is returned unchanged;
 * otherwise it is wrapped in a {@link BufferedInputStream}.
 *
 * @param in the InputStream
 * @param buffer the read-ahead limit passed to {@link InputStream#mark(int)}
 * @return the markable InputStream, already marked at its current position
 **/
public static final InputStream getMarkableInputStream(InputStream in, final int buffer) {
    if (!in.markSupported()) {
        in = new BufferedInputStream(in);
    }
    in.mark(buffer);
    return in;
}