List of usage examples for java.lang.Character.BYTES
int BYTES
To view the source code for java.lang.Character.BYTES, click the Source Link.
From source file:net.yacy.document.parser.GenericXMLParser.java
@Override public Document[] parse(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Failure { /* Limit the size of the in-memory buffer to at most 25% of the available memory : * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array. * Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */ final long availableMemory = MemoryControl.available(); final long maxBytes = (long) (availableMemory * 0.25); final int maxChars; if ((maxBytes / Character.BYTES) > Integer.MAX_VALUE) { maxChars = Integer.MAX_VALUE; } else {//w ww .j a v a2 s. com maxChars = ((int) maxBytes) / Character.BYTES; } try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer( maxChars);) { /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters), */ final XmlStreamReader reader = new XmlStreamReader(source, mimeType, true, charset); final InputSource saxSource = new InputSource(reader); final String detectedCharset = reader.getEncoding(); final List<AnchorURL> detectedURLs = new ArrayList<>(); final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs); final SAXParser saxParser = getParser(); saxParser.parse(saxSource, saxHandler); if (writer.isOverflow()) { throw new Parser.Failure("Not enough Memory available for generic the XML parser : " + Formatter.bytesToString(availableMemory), location); } /* create the parsed document */ Document[] docs = null; final byte[] contentBytes = UTF8.getBytes(writer.toString()); docs = new Document[] { new 
Document(location, mimeType, detectedCharset, this, null, null, null, null, "", null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) }; return docs; } catch (Parser.Failure e) { throw e; } catch (final Exception e) { throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location); } }
From source file:net.yacy.document.parser.GenericXMLParser.java
/** * {@inheritDoc}/*from w w w. j a v a 2s . co m*/ * @param maxBytes the maximum number of content bytes to process. Be careful with to small values : * a Failure exception can eventually be thrown when maxBytes value is so small that the parser can even not fill its buffers on input stream and parse the document declaration. */ @Override public Document[] parseWithLimits(DigestURL location, String mimeType, String charsetName, VocabularyScraper scraper, int timezoneOffset, InputStream source, int maxLinks, long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException { /* Limit the size of the in-memory buffer to at most 25% of the available memory : * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array. * Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */ final long availableMemory = MemoryControl.available(); final long maxTextBytes = (long) (availableMemory * 0.25); final int maxChars; if ((maxTextBytes / Character.BYTES) > Integer.MAX_VALUE) { maxChars = Integer.MAX_VALUE; } else { maxChars = ((int) maxTextBytes) / Character.BYTES; } try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer( maxChars);) { final Set<AnchorURL> detectedURLs = new HashSet<>(); final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks); StrictLimitInputStream limitedSource = new StrictLimitInputStream(source, maxBytes); /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters), */ final XmlStreamReader reader = new XmlStreamReader(limitedSource, mimeType, true, charsetName); final InputSource saxSource = 
new InputSource(reader); final String detectedCharset = reader.getEncoding(); final SAXParser saxParser = getParser(); boolean limitExceeded = false; try { saxParser.parse(saxSource, saxHandler); } catch (SAXException e) { if (!(e.getCause() instanceof SizeLimitExceededException)) { /* Only transmit to upper layer exceptions that are not caused by the maxLinks limit being reached */ throw e; } limitExceeded = true; } catch (StreamLimitException e) { limitExceeded = true; } if (writer.isOverflow()) { throw new Parser.Failure("Not enough Memory available for generic the XML parser : " + Formatter.bytesToString(availableMemory), location); } /* Create the parsed document with eventually only partial part of the text and links */ final byte[] contentBytes = UTF8.getBytes(writer.toString()); Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "", null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) }; docs[0].setPartiallyParsed(limitExceeded); return docs; } catch (final Exception e) { throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location); } }