List of usage examples for the java.util.concurrent.ArrayBlockingQueue constructor
public ArrayBlockingQueue(int capacity)
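The examples below come from real projects. As a baseline, here is a minimal, self-contained sketch of this constructor in use, assuming nothing beyond the JDK: the capacity fixes the queue size at construction time, so put() blocks once the queue is full and take() blocks while it is empty. All names are illustrative.

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class BoundedQueueDemo {
    public static void main(String[] args) throws InterruptedException {
        // Fixed capacity of 4: put() blocks once 4 elements are queued.
        BlockingQueue<Integer> queue = new ArrayBlockingQueue<>(4);

        Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) {
                    queue.put(i); // blocks while the queue is full
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        producer.start();

        for (int i = 0; i < 10; i++) {
            System.out.println("took " + queue.take()); // blocks while empty
        }
        producer.join();
    }
}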
From source file:com.fluidops.iwb.api.ProviderServiceImpl.java
Timer scheduleProviders() {
    TimerTask timerTask = new TimerTask() {
        Queue<AbstractFlexProvider> queue = new ArrayBlockingQueue<AbstractFlexProvider>(10000);

        @Override
        public void run() {
            try {
                if (queue.isEmpty()) {
                    // add any work
                    for (AbstractFlexProvider s : getProviders()) {
                        if (s.pollInterval <= 0)
                            continue; // disabled
                        if (s.running != null && s.running == true)
                            continue;
                        if (s instanceof ExternalProvider)
                            continue;
                        if (s instanceof LookupProvider)
                            continue;
                        if (s.lastUpdate == null)
                            queue.add(s);
                        else if (s.lastUpdate.getTime() + s.pollInterval < System.currentTimeMillis())
                            queue.add(s);
                    }
                }
                if (!queue.isEmpty()) {
                    AbstractFlexProvider provider = queue.poll();
                    runProvider(provider.providerID, null);
                }
            } catch (Exception e) {
                logger.error(e.getMessage(), e);
            }
        }
    };
    Timer timer = new Timer("IWB Provider Update");
    timer.schedule(timerTask, 1000, 1000);
    TimerRegistry.getInstance().registerProviderServiceTimer(timer);
    return timer;
}
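In this example the queue is only touched from the single timer thread, so the ArrayBlockingQueue serves as a plain bounded FIFO of pending providers: refilled when it runs dry, drained one element per tick. A stripped-down sketch of that refill-when-empty pattern (the Runnable tasks and the 10 000 capacity are illustrative, not part of the original code):

import java.util.Queue;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ArrayBlockingQueue;

public class OnePerTickScheduler {
    private final Queue<Runnable> pending = new ArrayBlockingQueue<>(10_000);

    public Timer start(Iterable<Runnable> source) {
        Timer timer = new Timer("work-dispatcher");
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                if (pending.isEmpty()) {
                    for (Runnable task : source) {
                        if (!pending.offer(task)) // refill when drained; stop if the buffer fills
                            break;
                    }
                }
                Runnable next = pending.poll();
                if (next != null) {
                    next.run(); // at most one task per tick
                }
            }
        }, 1000, 1000);
        return timer;
    }
}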
From source file:ca.gnewton.lusql.core.LuSql.java
void initThreadPoolExecutor() {
    if (numThreads < minThreadPoolThreads)
        minThreadPoolThreads = numThreads;
    recordQueue = new ArrayBlockingQueue<Runnable>(makeQueueSize());
    if (threadPoolExecutor == null)
        threadPoolExecutor = new AddDocumentExecutor(minThreadPoolThreads, numThreads, 16L, TimeUnit.SECONDS,
                recordQueue, new ThreadPoolExecutor.CallerRunsPolicy(), this);
}
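A bounded ArrayBlockingQueue as a ThreadPoolExecutor's work queue caps memory use, and pairing it with CallerRunsPolicy means a full queue throttles the submitter by making it run the task itself. A minimal sketch of that combination, using a plain ThreadPoolExecutor rather than the example's custom AddDocumentExecutor subclass (pool sizes and capacity are arbitrary):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class BoundedPoolDemo {
    public static void main(String[] args) {
        ThreadPoolExecutor pool = new ThreadPoolExecutor(
                2, 4,                                     // core and max threads
                16L, TimeUnit.SECONDS,                    // idle keep-alive
                new ArrayBlockingQueue<>(8),              // bounded backlog
                new ThreadPoolExecutor.CallerRunsPolicy() // full queue => caller runs the task
        );
        for (int i = 0; i < 100; i++) {
            final int n = i;
            pool.execute(() -> System.out.println("task " + n + " on " + Thread.currentThread().getName()));
        }
        pool.shutdown();
    }
}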
From source file:org.kchine.r.server.RListener.java
public static String[] clusterApply(final String cl, final String varName, final String functionName,
        final String ato, final String asynch) {
    new Thread(new Runnable() {
        public void run() {
            try {
                Cluster cluster = _clustersHash.get(cl);
                if (cluster == null) {
                    new Thread(new Runnable() {
                        public void run() {
                            try {
                                DirectJNI.getInstance().getRServices()
                                        .consoleSubmit(convertToPrintCommand("Invalid cluster"));
                            } catch (Exception e) {
                                e.printStackTrace();
                            }
                        }
                    }).start();
                }
                RObject v = DirectJNI.getInstance().getRServices().getObject(varName);
                RObject vtemp = null;
                if (v.getClass() == RMatrix.class) {
                    vtemp = ((RMatrix) v).getValue();
                } else if (v.getClass() == RArray.class) {
                    vtemp = ((RArray) v).getValue();
                } else {
                    vtemp = v;
                }
                final RObject var = vtemp;
                final VWrapper vwrapper = new VWrapper() {
                    public int getSize() {
                        if (var.getClass() == RNumeric.class) {
                            return ((RNumeric) var).getValue().length;
                        } else if (var.getClass() == RInteger.class) {
                            return ((RInteger) var).getValue().length;
                        } else if (var.getClass() == RChar.class) {
                            return ((RChar) var).getValue().length;
                        } else if (var.getClass() == RLogical.class) {
                            return ((RLogical) var).getValue().length;
                        } else if (var.getClass() == RComplex.class) {
                            return ((RComplex) var).getReal().length;
                        } else if (var.getClass() == RList.class) {
                            return ((RList) var).getValue().length;
                        }
                        return 0;
                    }

                    public RObject getElementAt(int i) {
                        if (var.getClass() == RNumeric.class) {
                            return new RNumeric(((RNumeric) var).getValue()[i]);
                        } else if (var.getClass() == RInteger.class) {
                            return new RInteger(((RInteger) var).getValue()[i]);
                        } else if (var.getClass() == RChar.class) {
                            return new RChar(((RChar) var).getValue()[i]);
                        } else if (var.getClass() == RLogical.class) {
                            return new RLogical(((RLogical) var).getValue()[i]);
                        } else if (var.getClass() == RComplex.class) {
                            return new RComplex(new double[] { ((RComplex) var).getReal()[i] },
                                    new double[] { ((RComplex) var).getImaginary()[i] },
                                    ((RComplex) var).getIndexNA() != null
                                            ? new int[] { ((RComplex) var).getIndexNA()[i] } : null,
                                    ((RComplex) var).getNames() != null
                                            ? new String[] { ((RComplex) var).getNames()[i] } : null);
                        } else if (var.getClass() == RList.class) {
                            return (RObject) ((RList) var).getValue()[i];
                        }
                        return null;
                    }

                    public Object gatherResults(RObject[] f) {
                        if (var.getClass() == RList.class) {
                            return f;
                        } else {
                            Class<?> resultClass = f[0].getClass();
                            RObject result = null;
                            if (resultClass == RNumeric.class) {
                                double[] t = new double[f.length];
                                for (int i = 0; i < f.length; ++i)
                                    t[i] = ((RNumeric) f[i]).getValue()[0];
                                result = new RNumeric(t);
                            } else if (resultClass == RInteger.class) {
                                int[] t = new int[f.length];
                                for (int i = 0; i < f.length; ++i)
                                    t[i] = ((RInteger) f[i]).getValue()[0];
                                result = new RInteger(t);
                            } else if (resultClass == RChar.class) {
                                String[] t = new String[f.length];
                                for (int i = 0; i < f.length; ++i)
                                    t[i] = ((RChar) f[i]).getValue()[0];
                                result = new RChar(t);
                            } else if (resultClass == RLogical.class) {
                                boolean[] t = new boolean[f.length];
                                for (int i = 0; i < f.length; ++i)
                                    t[i] = ((RLogical) f[i]).getValue()[0];
                                result = new RLogical(t);
                            } else if (resultClass == RComplex.class) {
                                double[] real = new double[f.length];
                                double[] im = new double[f.length];
                                for (int i = 0; i < f.length; ++i) {
                                    real[i] = ((RComplex) f[i]).getReal()[0];
                                    im[i] = ((RComplex) f[i]).getImaginary()[0];
                                }
                                result = new RComplex(real, im, null, null);
                            } else {
                                throw new RuntimeException(
                                        "Can't Handle this result type :" + resultClass.getName());
                            }
                            return result;
                        }
                    }
                };
                if (vwrapper.getSize() == 0) {
                    new Thread(new Runnable() {
                        public void run() {
                            try {
                                DirectJNI.getInstance().getRServices()
                                        .consoleSubmit(convertToPrintCommand("0 elements in data"));
                            } catch (Exception e) {
                                e.printStackTrace();
                            }
                        }
                    }).start();
                }
                Vector<RServices> workers = cluster.getWorkers();
                final ArrayBlockingQueue<Integer> indexesQueue = new ArrayBlockingQueue<Integer>(
                        vwrapper.getSize());
                for (int i = 0; i < vwrapper.getSize(); ++i)
                    indexesQueue.add(i);
                final ArrayBlockingQueue<RServices> workersQueue = new ArrayBlockingQueue<RServices>(
                        workers.size());
                for (int i = 0; i < workers.size(); ++i)
                    workersQueue.add(workers.elementAt(i));
                final RObject[] result = new RObject[vwrapper.getSize()];
                for (int i = 0; i < workers.size(); ++i) {
                    new Thread(new Runnable() {
                        public void run() {
                            RServices r = workersQueue.poll();
                            while (indexesQueue.size() > 0) {
                                Integer idx = indexesQueue.poll();
                                if (idx != null) {
                                    try {
                                        result[idx] = r.call(functionName, vwrapper.getElementAt(idx));
                                    } catch (Exception e) {
                                        e.printStackTrace();
                                        result[idx] = nullObject;
                                    }
                                }
                            }
                        }
                    }).start();
                }
                while (true) {
                    int count = 0;
                    for (int i = 0; i < result.length; ++i)
                        if (result[i] != null)
                            ++count;
                    if (count == result.length)
                        break;
                    Thread.sleep(100);
                }
                Object reconstituedObject = vwrapper.gatherResults(result);
                if (v.getClass() == RMatrix.class) {
                    ((RArray) v).setValue((RVector) reconstituedObject);
                } else if (v.getClass() == RArray.class) {
                    ((RArray) v).setValue((RVector) reconstituedObject);
                } else if (v.getClass() == RList.class) {
                    ((RList) v).setValue((RObject[]) reconstituedObject);
                } else {
                    v = (RObject) reconstituedObject;
                }
                final RObject final_v = v;
                new Thread(new Runnable() {
                    public void run() {
                        try {
                            DirectJNI.getInstance().getRServices().putAndAssign(final_v,
                                    (ato.equals("") ? functionName + "_" + varName : ato));
                            DirectJNI.getInstance().getRServices().consoleSubmit(
                                    convertToPrintCommand("Cluster Apply result assigned to R variable "
                                            + (ato.equals("") ? functionName + "_" + varName : ato) + "\n"));
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
                }).start();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }).start();
    return new String[] { "OK", convertToPrintCommand("Cluster Apply Submitted in background..") };
}
From source file:com.koda.integ.hbase.storage.FileExtStorage.java
/**
 * Get existing file.
 *
 * @param id the id
 * @return file
 */
public RandomAccessFile getFile(int id) {
    if (!existedIds.containsKey((long) id)) {
        return null;
    }
    Queue<RandomAccessFile> fileReaders = readers.get(id);
    if (fileReaders == null) {
        if (!existedIds.containsKey((long) id)) {
            return null;
        }
        fileReaders = new ArrayBlockingQueue<RandomAccessFile>(maxOpenFD);
        readers.putIfAbsent(id, fileReaders);
    }
    fileReaders = readers.get(id);
    if (fileReaders == null) {
        return null;
    }
    RandomAccessFile raf = fileReaders.poll();
    if (raf == null) {
        raf = openFile(id, "r");
    }
    return raf;
}
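Here the ArrayBlockingQueue serves as a bounded object pool: poll() hands out a cached file handle or null (in which case a new one is opened), and the matching release path, not shown in this excerpt, would typically offer() the handle back, silently dropping it once the pool holds maxOpenFD entries. A minimal generic sketch of that pooling pattern (BoundedPool and its factory are illustrative):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.function.Supplier;

public class BoundedPool<T> {
    private final BlockingQueue<T> pool;
    private final Supplier<T> factory;

    public BoundedPool(int maxIdle, Supplier<T> factory) {
        this.pool = new ArrayBlockingQueue<>(maxIdle);
        this.factory = factory;
    }

    public T acquire() {
        T item = pool.poll(); // reuse a pooled instance if one is available
        return item != null ? item : factory.get();
    }

    public void release(T item) {
        // offer() returns false when the pool is full; the surplus item
        // is simply discarded (a real pool would close it here).
        pool.offer(item);
    }
}

Usage: BoundedPool<StringBuilder> p = new BoundedPool<>(8, StringBuilder::new);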
From source file:com.dumontierlab.pdb2rdf.Pdb2Rdf.java
private static ExecutorService getThreadPool(CommandLine cmd) {
    // twice the number of PU
    final Object monitor = new Object();
    int numberOfThreads = getNumberOfThreads(cmd);
    LOG.info("Using " + numberOfThreads + " threads.");
    ThreadPoolExecutor threadPool = new ThreadPoolExecutor(numberOfThreads, numberOfThreads, 10,
            TimeUnit.MINUTES, new ArrayBlockingQueue<Runnable>(1), new RejectedExecutionHandler() {
                @Override
                public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) {
                    synchronized (monitor) {
                        try {
                            monitor.wait();
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                        }
                    }
                    executor.execute(r);
                }
            }) {
        @Override
        protected void afterExecute(Runnable r, Throwable t) {
            synchronized (monitor) {
                monitor.notify();
            }
            super.afterExecute(r, t);
        }
    };
    return threadPool;
}
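The capacity-1 ArrayBlockingQueue plus a waiting RejectedExecutionHandler turns execute() into a blocking call, so the submitter can never race ahead of the pool. A common, simpler variant of the same idea blocks the submitter directly on the queue; this is a sketch of that variant, not the example's exact monitor-based mechanism:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class BlockingSubmitPool {
    public static ThreadPoolExecutor create(int threads) {
        return new ThreadPoolExecutor(threads, threads, 10, TimeUnit.MINUTES,
                new ArrayBlockingQueue<>(1),
                (task, executor) -> {
                    try {
                        // Block the submitting thread until the single queue slot frees up.
                        executor.getQueue().put(task);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        throw new RejectedExecutionException("interrupted while waiting to submit", e);
                    }
                });
    }
}

One caveat with this variant: a task put directly on the queue after shutdown() is never rejected, so production code usually re-checks executor.isShutdown() inside the handler.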
From source file:org.apache.hadoop.net.unix.TestDomainSocket.java
/**
 * Test file descriptor passing.
 *
 * @throws IOException
 */
@Test(timeout = 180000)
public void testFdPassing() throws Exception {
    final String TEST_PATH = new File(sockDir.getDir(), "test_sock").getAbsolutePath();
    final byte clientMsg1[] = new byte[] { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66 };
    final byte serverMsg1[] = new byte[] { 0x31, 0x30, 0x32, 0x34, 0x31, 0x33, 0x44, 0x1, 0x1, 0x1, 0x1, 0x1 };
    final ArrayBlockingQueue<Throwable> threadResults = new ArrayBlockingQueue<Throwable>(2);
    final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH);
    final PassedFile passedFiles[] = new PassedFile[] { new PassedFile(1), new PassedFile(2) };
    final FileDescriptor passedFds[] = new FileDescriptor[passedFiles.length];
    for (int i = 0; i < passedFiles.length; i++) {
        passedFds[i] = passedFiles[i].getInputStream().getFD();
    }
    Thread serverThread = new Thread() {
        public void run() {
            // Run server
            DomainSocket conn = null;
            try {
                conn = serv.accept();
                byte in1[] = new byte[clientMsg1.length];
                InputStream connInputStream = conn.getInputStream();
                IOUtils.readFully(connInputStream, in1, 0, in1.length);
                Assert.assertTrue(Arrays.equals(clientMsg1, in1));
                DomainSocket domainConn = (DomainSocket) conn;
                domainConn.sendFileDescriptors(passedFds, serverMsg1, 0, serverMsg1.length);
                conn.close();
            } catch (Throwable e) {
                threadResults.add(e);
                Assert.fail(e.getMessage());
            }
            threadResults.add(new Success());
        }
    };
    serverThread.start();
    Thread clientThread = new Thread() {
        public void run() {
            try {
                DomainSocket client = DomainSocket.connect(TEST_PATH);
                OutputStream clientOutputStream = client.getOutputStream();
                InputStream clientInputStream = client.getInputStream();
                clientOutputStream.write(clientMsg1);
                DomainSocket domainConn = (DomainSocket) client;
                byte in1[] = new byte[serverMsg1.length];
                FileInputStream recvFis[] = new FileInputStream[passedFds.length];
                int r = domainConn.recvFileInputStreams(recvFis, in1, 0, in1.length - 1);
                Assert.assertTrue(r > 0);
                IOUtils.readFully(clientInputStream, in1, r, in1.length - r);
                Assert.assertTrue(Arrays.equals(serverMsg1, in1));
                for (int i = 0; i < passedFds.length; i++) {
                    Assert.assertNotNull(recvFis[i]);
                    passedFiles[i].checkInputStream(recvFis[i]);
                }
                for (FileInputStream fis : recvFis) {
                    fis.close();
                }
                client.close();
            } catch (Throwable e) {
                threadResults.add(e);
            }
            threadResults.add(new Success());
        }
    };
    clientThread.start();
    for (int i = 0; i < 2; i++) {
        Throwable t = threadResults.take();
        if (!(t instanceof Success)) {
            Assert.fail(t.getMessage() + ExceptionUtils.getStackTrace(t));
        }
    }
    serverThread.join(120000);
    clientThread.join(120000);
    serv.close();
    for (PassedFile pf : passedFiles) {
        pf.cleanup();
    }
}
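The queue here is a rendezvous for test outcomes: each worker thread deposits either the Throwable it hit or a Success marker, and the main thread take()s exactly one outcome per worker, failing on the first error. A minimal sketch of that pattern, with Success as an illustrative marker type standing in for the test's own:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class ThreadOutcomeDemo {
    // Marker for a thread that finished without throwing.
    static final class Success extends Throwable {}

    public static void main(String[] args) throws InterruptedException {
        int workers = 2;
        BlockingQueue<Throwable> outcomes = new ArrayBlockingQueue<>(workers);

        for (int i = 0; i < workers; i++) {
            final int id = i;
            new Thread(() -> {
                try {
                    if (id == 1) throw new IllegalStateException("worker " + id + " failed");
                    outcomes.add(new Success());
                } catch (Throwable t) {
                    outcomes.add(t); // hand the failure to the main thread
                }
            }).start();
        }

        for (int i = 0; i < workers; i++) {
            Throwable t = outcomes.take(); // blocks until each worker reports
            System.out.println(t instanceof Success ? "ok" : "failed: " + t);
        }
    }
}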
From source file:org.apache.hadoop.hdfs.server.datanode.IABlockSender.java
/**
 * sendBlock() is used to read (and encode) a block and its metadata and stream the data to
 * either a client or to another datanode.
 *
 * @param out stream to which the block is written
 * @param baseStream optional. if non-null, <code>out</code> is assumed to
 *        be a wrapper over this stream. This enables optimizations for
 *        sending the data, e.g.
 *        {@link SocketOutputStream#transferToFully(FileChannel, long, int)}.
 * @param throttler for sending data.
 * @return total bytes read, including crc.
 */
long sendBlock(DataOutputStream out, OutputStream baseStream, DataTransferThrottler throttler)
        throws IOException {
    if (out == null) {
        throw new IOException("out stream is null");
    }
    this.throttler = throttler;
    if (throttler == null)
        LOG.info("throttler is null");
    else
        LOG.info("throttler bandwidth: " + throttler.getBandwidth());
    long initialOffset = offset;
    long totalRead = 0;
    OutputStream streamForSendChunks = out;
    final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0;
    try {
        try {
            checksum.writeHeader(out);
            if (chunkOffsetOK) {
                out.writeLong(offset);
            }
            out.flush();
        } catch (IOException e) { // socket error
            throw ioeToSocketException(e);
        }
        int maxChunksPerPacket = 1;
        int pktSize = PacketHeader.PKT_HEADER_LEN;
        if (transferToAllowed && !verifyChecksum && baseStream instanceof SocketOutputStream
                && blockIn instanceof FileInputStream) {
            // The transferTo() optimization of the stock block sender is disabled here:
            //FileChannel fileChannel = ((FileInputStream) blockIn).getChannel();
            // blockInPosition also indicates sendChunks() uses transferTo.
            //blockInPosition = fileChannel.position();
            //streamForSendChunks = baseStream;
            // assure a minimum buffer size.
            //maxChunksPerPacket = (Math.max(BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO)
            //        + bytesPerChecksum - 1) / bytesPerChecksum;
            // allocate smaller buffer while using transferTo().
            //pktSize += checksumSize * maxChunksPerPacket;
        } else {
            maxChunksPerPacket = Math.max(1, (BUFFER_SIZE + bytesPerChecksum - 1) / bytesPerChecksum);
            pktSize += ((bytesPerChecksum + checksumSize) * maxChunksPerPacket);
        }
        // queue for passing data from encode to output
        BlockingQueue<ByteBuffer> q = new ArrayBlockingQueue<ByteBuffer>(64);
        // Encode thread
        IAREncoder encoder = new IAREncoder(q);
        new Thread(encoder).start();
        ByteBuffer pktBuf = ByteBuffer.allocate(pktSize);
        // output, send chunks
        while (endOffset > offset) {
            long len = sendChunks(pktBuf, maxChunksPerPacket, streamForSendChunks, q);
            offset += len;
            totalRead += len + ((len + bytesPerChecksum - 1) / bytesPerChecksum * checksumSize);
            seqno++;
        }
        try {
            // send an empty packet to mark the end of the block
            sendChunks(pktBuf, maxChunksPerPacket, streamForSendChunks, q);
            out.flush();
            LOG.info("Send last Chunk");
        } catch (IOException e) { // socket error
            LOG.info("IOException in sendChunks");
            throw ioeToSocketException(e);
        }
        sentEntireByteRange = true;
    } finally {
        if (clientTraceFmt != null) {
            final long endTime = System.nanoTime();
            ClientTraceLog.info(String.format(clientTraceFmt, totalRead, initialOffset, endTime - startTime));
        }
        close();
    }
    blockReadFully = initialOffset == 0 && offset >= replicaVisibleLength;
    return totalRead;
}
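Both this sender and the PMBlockSender example that follows use the same pipeline shape: an encoder thread fills a BlockingQueue<ByteBuffer> of bounded depth (64 here) while the sending loop drains it, so encoding and network I/O overlap and the number of in-flight buffers stays capped. A stripped-down sketch of that handoff, using an empty buffer as an end-of-stream marker (all names are illustrative):

import java.nio.ByteBuffer;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class EncodePipelineDemo {
    public static void main(String[] args) throws InterruptedException {
        // At most 64 encoded buffers in flight between the two threads.
        BlockingQueue<ByteBuffer> q = new ArrayBlockingQueue<>(64);

        Thread encoder = new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) {
                    ByteBuffer buf = ByteBuffer.allocate(8);
                    buf.putLong(i).flip();   // "encode" one packet
                    q.put(buf);              // blocks if the sender falls behind
                }
                q.put(ByteBuffer.allocate(0)); // empty buffer marks end of block
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        encoder.start();

        ByteBuffer buf;
        while ((buf = q.take()).hasRemaining()) { // stop at the empty marker
            System.out.println("sending packet " + buf.getLong());
        }
        encoder.join();
    }
}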
From source file:org.apache.hadoop.hdfs.server.datanode.PMBlockSender.java
/**
 * sendBlock() is used to read (and encode) a block and its metadata and stream the data to
 * either a client or to another datanode.
 *
 * @param out stream to which the block is written
 * @param baseStream optional. if non-null, <code>out</code> is assumed to
 *        be a wrapper over this stream. This enables optimizations for
 *        sending the data, e.g.
 *        {@link SocketOutputStream#transferToFully(FileChannel, long, int)}.
 * @param throttler for sending data.
 * @return total bytes read, including crc.
 */
long sendBlock(DataOutputStream out, OutputStream baseStream, DataTransferThrottler throttler)
        throws IOException {
    if (out == null) {
        throw new IOException("out stream is null");
    }
    this.throttler = throttler;
    if (throttler == null)
        LOG.info("throttler is null");
    else
        LOG.info("throttler bandwidth: " + throttler.getBandwidth());
    long initialOffset = offset;
    long totalRead = 0;
    OutputStream streamForSendChunks = out;
    final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0;
    try {
        try {
            checksum.writeHeader(out);
            if (chunkOffsetOK) {
                out.writeLong(offset);
            }
            out.flush();
        } catch (IOException e) { // socket error
            throw ioeToSocketException(e);
        }
        int maxChunksPerPacket = 1;
        int pktSize = PacketHeader.PKT_HEADER_LEN;
        if (transferToAllowed && !verifyChecksum && baseStream instanceof SocketOutputStream
                && blockIn instanceof FileInputStream) {
            // The transferTo() optimization of the stock block sender is disabled here:
            //FileChannel fileChannel = ((FileInputStream) blockIn).getChannel();
            // blockInPosition also indicates sendChunks() uses transferTo.
            //blockInPosition = fileChannel.position();
            //streamForSendChunks = baseStream;
            // assure a minimum buffer size.
            //maxChunksPerPacket = (Math.max(BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO)
            //        + bytesPerChecksum - 1) / bytesPerChecksum;
            // allocate smaller buffer while using transferTo().
            //pktSize += checksumSize * maxChunksPerPacket;
        } else {
            maxChunksPerPacket = Math.max(1, (BUFFER_SIZE + bytesPerChecksum - 1) / bytesPerChecksum);
            pktSize += ((bytesPerChecksum + checksumSize) * maxChunksPerPacket);
        }
        // queue for passing data from encode to output
        BlockingQueue<ByteBuffer> q = new ArrayBlockingQueue<ByteBuffer>(64);
        // Encode thread
        PMREncoder encoder = new PMREncoder(q);
        new Thread(encoder).start();
        ByteBuffer pktBuf = ByteBuffer.allocate(pktSize);
        // output, send chunks
        while (endOffset > offset) {
            long len = sendChunks(pktBuf, maxChunksPerPacket, streamForSendChunks, q);
            offset += len;
            totalRead += len + ((len + bytesPerChecksum - 1) / bytesPerChecksum * checksumSize);
            seqno++;
        }
        try {
            // send an empty packet to mark the end of the block
            sendChunks(pktBuf, maxChunksPerPacket, streamForSendChunks, q);
            out.flush();
            LOG.info("Send last Chunk");
        } catch (IOException e) { // socket error
            LOG.info("IOException in sendChunks");
            throw ioeToSocketException(e);
        }
        sentEntireByteRange = true;
    } finally {
        if (clientTraceFmt != null) {
            final long endTime = System.nanoTime();
            ClientTraceLog.info(String.format(clientTraceFmt, totalRead, initialOffset, endTime - startTime));
        }
        close();
    }
    blockReadFully = initialOffset == 0 && offset >= replicaVisibleLength;
    return totalRead;
}
From source file:org.eclipse.smila.connectivity.framework.crawler.jdbc.JdbcCrawler.java
/**
 * {@inheritDoc}
 */
@Override
public void initialize(final DataSourceConnectionConfig config) throws CrawlerException, CrawlerCriticalException {
    if (_log.isDebugEnabled()) {
        _log.debug("Initializing JdbcCrawler...");
    }
    synchronized (_openedMonitor) {
        if (_opened) {
            throw new CrawlerCriticalException(
                    "Crawler is already busy. This should not be the case when initializing.");
        }
        _opened = true;
        _forceClosing = false;
    }
    _performanceCounters = new CrawlerPerformanceCounterHelper<JdbcCrawlerPerformanceAgent>(config, hashCode(),
            JdbcCrawlerPerformanceAgent.class);
    _isProducerRunning = true;
    _internalQueue = new ArrayBlockingQueue<DataReference>(INTERNAL_QUEUE_CAPACITY);
    _dataSourceID = config.getDataSourceID();
    final Attributes attributes = config.getAttributes();
    final List<IAttribute> attributeList = attributes.getAttribute();
    _attributes = attributeList.toArray(new Attribute[attributeList.size()]);
    _process = (Process) config.getProcess();
    _recordCache = new HashMap<ConnectivityId, Record>();
    _producerThread = new CrawlingProducerThread();
    _producerThread.start();
}
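Here the bounded _internalQueue decouples the crawler's producer thread from its consumer and provides backpressure: the producer blocks once INTERNAL_QUEUE_CAPACITY data references are buffered. A minimal sketch of such a buffer, with the consumer using poll() with a timeout so it can notice a shutdown flag instead of blocking forever (the record strings and the capacity are illustrative):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

public class CrawlBufferDemo {
    private static final int INTERNAL_QUEUE_CAPACITY = 100;

    public static void main(String[] args) throws InterruptedException {
        BlockingQueue<String> internalQueue = new ArrayBlockingQueue<>(INTERNAL_QUEUE_CAPACITY);
        AtomicBoolean producerRunning = new AtomicBoolean(true);

        Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) {
                    internalQueue.put("record-" + i); // blocks when the buffer is full
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } finally {
                producerRunning.set(false);
            }
        });
        producer.start();

        // Consumer: keep polling while the producer runs or data remains buffered.
        while (producerRunning.get() || !internalQueue.isEmpty()) {
            String record = internalQueue.poll(100, TimeUnit.MILLISECONDS);
            if (record != null) {
                System.out.println("crawled " + record);
            }
        }
        producer.join();
    }
}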
From source file:efen.parsewiki.WikipediaDocumentSequence.java
@Override
public DocumentIterator iterator() throws IOException {
    final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
    saxParserFactory.setNamespaceAware(true);
    final MutableString nameSpaceAccumulator = new MutableString();
    final ObjectOpenHashSet<MutableString> nameSpacesAccumulator = new ObjectOpenHashSet<MutableString>();
    final ArrayBlockingQueue<DocumentFactory> freeFactories = new ArrayBlockingQueue<DocumentFactory>(16);
    for (int i = freeFactories.remainingCapacity(); i-- != 0;)
        freeFactories.add(this.factory.copy());
    final ArrayBlockingQueue<DocumentAndFactory> readyDocumentsAndFactories = new ArrayBlockingQueue<DocumentAndFactory>(
            freeFactories.size());
    final SAXParser parser;
    try {
        parser = saxParserFactory.newSAXParser();
    } catch (Exception e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    final DefaultHandler handler = new DefaultHandler() {
        private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
        private boolean inText;
        private boolean inTitle;
        private boolean inId;
        private boolean inTimestamp;
        private boolean inNamespaceDef;
        private boolean redirect;
        private MutableString text = new MutableString();
        private MutableString title = new MutableString();
        private MutableString id = new MutableString();
        private MutableString timestamp = new MutableString();
        private final Reference2ObjectMap<Enum<?>, Object> metadata = new Reference2ObjectOpenHashMap<Enum<?>, Object>();
        {
            metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
            metadata.put(MetadataKeys.REDIRECT, redirectAnchors);
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            if ("page".equals(localName)) {
                redirect = inText = inTitle = inId = inTimestamp = false;
                text.length(0);
                title.length(0);
                id.length(0);
                timestamp.length(0);
            } else if ("text".equals(localName))
                inText = true;
            else if ("title".equals(localName) && title.length() == 0)
                inTitle = true; // We catch only the first id/title elements.
            else if ("id".equals(localName) && id.length() == 0)
                inId = true;
            else if ("timestamp".equals(localName) && timestamp.length() == 0)
                inTimestamp = true;
            else if ("redirect".equals(localName)) {
                redirect = true;
                if (attributes.getValue("title") != null)
                    // Accumulate the title of the page as virtual text of the redirect page.
                    synchronized (redirectAnchors) {
                        final String link = Encoder.encodeTitleToUrl(attributes.getValue("title"), true);
                        redirectAnchors.add(new AnchorExtractor.Anchor(
                                new MutableString(baseURL.length() + link.length()).append(baseURL).append(link),
                                title.copy()));
                    }
            } else if ("namespace".equals(localName)) {
                // Found a new namespace
                inNamespaceDef = true;
                nameSpaceAccumulator.length(0);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if ("namespace".equals(localName)) { // Collecting a namespace
                if (nameSpaceAccumulator.length() != 0)
                    nameSpacesAccumulator.add(nameSpaceAccumulator.copy().toLowerCase());
                return;
            }
            if ("namespaces".equals(localName)) { // All namespaces collected
                nameSpaces = ImmutableSet.copyOf(nameSpacesAccumulator);
                return;
            }
            if (!redirect) {
                if ("title".equals(localName)) {
                    // Set basic metadata for the page
                    metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, title.copy());
                    String link = Encoder.encodeTitleToUrl(title.toString(), true);
                    metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI,
                            new MutableString(baseURL.length() + link.length()).append(baseURL).append(link));
                    inTitle = false;
                } else if ("id".equals(localName)) {
                    metadata.put(MetadataKeys.ID, Long.valueOf(id.toString()));
                    inId = false;
                } else if ("timestamp".equals(localName)) {
                    try {
                        metadata.put(MetadataKeys.LASTEDIT, dateFormat.parse(timestamp.toString()));
                    } catch (ParseException e) {
                        throw new RuntimeException(e.getMessage(), e);
                    }
                    inTimestamp = false;
                } else if ("text".equals(localName)) {
                    inText = false;
                    if (!keepNamespaced) {
                        // Namespaces are case-insensitive and language-dependent
                        final int pos = title.indexOf(':');
                        if (pos != -1 && isATrueNamespace(title.substring(0, pos)))
                            return;
                    }
                    try {
                        final MutableString html = new MutableString();
                        DocumentFactory freeFactory;
                        try {
                            freeFactory = freeFactories.take();
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e.getMessage(), e);
                        }
                        if (parseText) {
                            if (DISAMBIGUATION.search(text) != -1) { // It's a disambiguation page.
                                /* Roi's hack: duplicate links using the page title, so the generic
                                   name will end up as anchor text. */
                                final MutableString newLinks = new MutableString();
                                for (int start = 0, end; (start = BRACKETS_OPEN.search(text, start)) != -1; start = end) {
                                    end = start;
                                    final int endOfLink = text.indexOfAnyOf(END_OF_DISAMBIGUATION_LINK, start);
                                    // Note that we don't escape title because we are working at the
                                    // Wikipedia raw text level.
                                    if (endOfLink != -1) {
                                        newLinks.append(text.array(), start, endOfLink - start).append('|')
                                                .append(title).append("]]\n");
                                        end = endOfLink;
                                    }
                                    end++;
                                }
                                text.append(newLinks);
                            }
                            // We separate categories by OXOXO, so we don't get overflowing phrases.
                            final MutableString category = new MutableString();
                            for (int start = 0, end; (start = CATEGORY_START.search(text, start)) != -1; start = end) {
                                end = BRACKETS_CLOSED.search(text, start += CATEGORY_START.length());
                                if (end != -1)
                                    category.append(text.subSequence(start, end)).append(" OXOXO ");
                                else
                                    break;
                            }
                            metadata.put(MetadataKeys.CATEGORY, category);
                            // Heuristics to get the first paragraph
                            metadata.put(MetadataKeys.FIRSTPAR, new MutableString());
                            String plainText = new WikiModel(imageBaseURL, linkBaseURL)
                                    .render(new PlainTextConverter(true), text.toString());
                            for (int start = 0; start < plainText.length(); start++) {
                                if (Character.isWhitespace(plainText.charAt(start)))
                                    continue;
                                if (plainText.charAt(start) == '{') {
                                    start = BRACES_CLOSED.search(plainText, start);
                                    if (start == -1)
                                        break;
                                    start++;
                                } else if (plainText.charAt(start) == '[') {
                                    start = BRACKETS_CLOSED.search(plainText, start);
                                    if (start == -1)
                                        break;
                                    start++;
                                } else {
                                    final int end = plainText.indexOf('\n', start);
                                    if (end != -1)
                                        metadata.put(MetadataKeys.FIRSTPAR,
                                                new MutableString(plainText.substring(start, end)));
                                    break;
                                }
                            }
                            try {
                                WikiModel wikiModel = new WikiModel(imageBaseURL, linkBaseURL);
                                wikiModel.render(new HTMLConverter(), text.toString(), html, false, false);
                                final Map<String, String> categories = wikiModel.getCategories();
                                // Put back category links in the page (they have been parsed by bliki
                                // and do not appear anymore in the HTML rendering)
                                for (Entry<String, String> entry : categories.entrySet()) {
                                    final String key = entry.getKey();
                                    final String value = entry.getValue().trim();
                                    if (value.length() != 0) // There are empty such things
                                        html.append("\n<a href=\"").append(baseURL).append("Category:")
                                                .append(Encoder.encodeTitleToUrl(key, true)).append("\">")
                                                .append(HtmlEscapers.htmlEscaper().escape(key)).append("</a>\n");
                                }
                            } catch (Exception e) {
                                LOGGER.error("Unexpected exception while parsing " + title, e);
                            }
                        }
                        readyDocumentsAndFactories.put(new DocumentAndFactory(
                                freeFactory.getDocument(IOUtils.toInputStream(html, Charsets.UTF_8),
                                        new Reference2ObjectOpenHashMap<Enum<?>, Object>(metadata)),
                                freeFactory));
                    } catch (InterruptedException e) {
                        throw new RuntimeException(e.getMessage(), e);
                    } catch (IOException e) {
                        throw new RuntimeException(e.getMessage(), e);
                    }
                }
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (inText && parseText)
                text.append(ch, start, length);
            if (inTitle)
                title.append(ch, start, length);
            if (inId)
                id.append(ch, start, length);
            if (inTimestamp)
                timestamp.append(ch, start, length);
            if (inNamespaceDef) {
                nameSpaceAccumulator.append(ch, start, length);
                inNamespaceDef = false; // Dirty, but it works
            }
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            if (inText && parseText)
                text.append(ch, start, length);
            if (inTitle)
                title.append(ch, start, length);
        }
    };
    final Thread parsingThread = new Thread() {
        public void run() {
            try {
                InputStream in = new FileInputStream(wikipediaXmlDump);
                if (bzipped)
                    in = new BZip2CompressorInputStream(in);
                parser.parse(
                        new InputSource(new InputStreamReader(new FastBufferedInputStream(in), Charsets.UTF_8)),
                        handler);
                readyDocumentsAndFactories.put(END);
            } catch (Exception e) {
                throw new RuntimeException(e.getMessage(), e);
            }
        }
    };
    parsingThread.start();
    return new AbstractDocumentIterator() {
        private DocumentFactory lastFactory;

        @Override
        public Document nextDocument() throws IOException {
            try {
                final DocumentAndFactory documentAndFactory = readyDocumentsAndFactories.take();
                if (lastFactory != null)
                    freeFactories.put(lastFactory);
                if (documentAndFactory == END)
                    return null;
                lastFactory = documentAndFactory.factory;
                return documentAndFactory.document;
            } catch (InterruptedException e) {
                throw new RuntimeException(e.getMessage(), e);
            }
        }
    };
}
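This last example combines two ArrayBlockingQueues into a recycling pipeline: a fixed pool of 16 document factories circulates between the parser thread (take() a free factory, put() a parsed document) and the iterator (take() a document, put() its factory back once the previous document is consumed), with a sentinel END object signaling exhaustion. A compact sketch of that two-queue recycling scheme, where the StringBuilder buffers and the END sentinel are illustrative stand-ins:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class RecyclingPipelineDemo {
    static final Object END = new Object(); // sentinel marking the end of the stream

    public static void main(String[] args) throws InterruptedException {
        int poolSize = 4;
        BlockingQueue<StringBuilder> freeBuffers = new ArrayBlockingQueue<>(poolSize);
        for (int i = 0; i < poolSize; i++) freeBuffers.add(new StringBuilder());
        BlockingQueue<Object> ready = new ArrayBlockingQueue<>(poolSize);

        Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) {
                    StringBuilder buf = freeBuffers.take(); // blocks until a buffer is recycled
                    buf.setLength(0);
                    buf.append("document ").append(i);
                    ready.put(buf);
                }
                ready.put(END);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        producer.start();

        Object item;
        while ((item = ready.take()) != END) {
            StringBuilder buf = (StringBuilder) item;
            System.out.println("consumed: " + buf);
            freeBuffers.put(buf); // recycle the buffer back to the producer
        }
        producer.join();
    }
}

Capping both queues at the pool size guarantees the pipeline never holds more than poolSize buffers in memory, no matter how far the producer gets ahead.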