// Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.crawler.frontier; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.URIException; import org.archive.crawler.event.CrawlStateEvent; import org.archive.crawler.framework.Frontier; import org.archive.modules.CrawlURI; import org.archive.modules.SchedulingConstants; import org.archive.modules.extractor.Hop; import org.archive.modules.extractor.LinkContext; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.ApplicationListener; import org.springframework.context.Lifecycle; import com.rabbitmq.client.AMQP.BasicProperties; import com.rabbitmq.client.Channel; import com.rabbitmq.client.Connection; import com.rabbitmq.client.ConnectionFactory; import com.rabbitmq.client.Consumer; import com.rabbitmq.client.DefaultConsumer; import 
com.rabbitmq.client.Envelope; import com.rabbitmq.client.ShutdownSignalException; /** * @contributor nlevitt */ public class AMQPUrlReceiver implements Lifecycle, ApplicationListener<CrawlStateEvent> { @SuppressWarnings("unused") private static final long serialVersionUID = 1L; private static final Logger logger = Logger.getLogger(AMQPUrlReceiver.class.getName()); public static final String A_RECEIVED_FROM_AMQP = "receivedFromAMQP"; protected Frontier frontier; public Frontier getFrontier() { return this.frontier; } @Autowired public void setFrontier(Frontier frontier) { this.frontier = frontier; } protected String amqpUri = "amqp://guest:guest@localhost:5672/%2f"; public String getAmqpUri() { return this.amqpUri; } public void setAmqpUri(String uri) { this.amqpUri = uri; } protected String exchange = "umbra"; public String getExchange() { return exchange; } public void setExchange(String exchange) { this.exchange = exchange; } protected String queueName = "requests"; public String getQueueName() { return queueName; } public void setQueueName(String queueName) { this.queueName = queueName; } protected boolean isRunning = false; @Override public boolean isRunning() { return isRunning; } private transient Lock lock = new ReentrantLock(true); private class StarterRestarter extends Thread { public StarterRestarter(String name) { super(name); } @Override public void run() { while (!Thread.interrupted()) { try { lock.lockInterruptibly(); try { if (!isRunning) { // start up again try { Consumer consumer = new UrlConsumer(channel()); channel().exchangeDeclare(getExchange(), "direct", true); channel().queueDeclare(getQueueName(), false, false, true, null); channel().queueBind(getQueueName(), getExchange(), getQueueName()); channel().basicConsume(getQueueName(), false, consumer); isRunning = true; logger.info("started AMQP consumer uri=" + getAmqpUri() + " exchange=" + getExchange() + " queueName=" + getQueueName()); } catch (IOException e) { logger.log(Level.SEVERE, 
"problem starting AMQP consumer (will try again after 30 seconds)", e); } } Thread.sleep(30000); } finally { lock.unlock(); } } catch (InterruptedException e) { return; } } } } transient private StarterRestarter starterRestarter; @Override public void start() { lock.lock(); try { // spawn off a thread to start up the amqp consumer, and try to restart it if it dies if (!isRunning) { starterRestarter = new StarterRestarter( AMQPUrlReceiver.class.getSimpleName() + "-starter-restarter"); starterRestarter.start(); } } finally { lock.unlock(); } } @Override public void stop() { lock.lock(); try { logger.info("shutting down"); if (connection != null && connection.isOpen()) { try { connection.close(); } catch (IOException e) { logger.log(Level.SEVERE, "problem closing AMQP connection", e); } } if (starterRestarter != null && starterRestarter.isAlive()) { starterRestarter.interrupt(); try { starterRestarter.join(); } catch (InterruptedException e) { } } starterRestarter = null; connection = null; channel = null; isRunning = false; } finally { lock.unlock(); } } transient protected Connection connection = null; transient protected Channel channel = null; protected Connection connection() throws IOException { lock.lock(); try { if (connection != null && !connection.isOpen()) { logger.warning("connection is closed, creating a new one"); connection = null; } if (connection == null) { ConnectionFactory factory = new ConnectionFactory(); try { factory.setUri(getAmqpUri()); } catch (Exception e) { throw new IOException("problem with AMQP uri " + getAmqpUri(), e); } connection = factory.newConnection(); } return connection; } finally { lock.unlock(); } } protected Channel channel() throws IOException { lock.lock(); try { if (channel != null && !channel.isOpen()) { logger.warning("channel is not open, creating a new one"); channel = null; } if (channel == null) { channel = connection().createChannel(); } return channel; } finally { lock.unlock(); } } // XXX should we be using 
QueueingConsumer because of possible blocking in // frontier.schedule()? // "Note: all methods of this interface are invoked inside the Connection's // thread. This means they a) should be non-blocking and generally do little // work, b) must not call Channel or Connection methods, or a deadlock will // ensue. One way of ensuring this is to use/subclass QueueingConsumer." protected class UrlConsumer extends DefaultConsumer { public UrlConsumer(Channel channel) { super(channel); } @Override public void handleDelivery(String consumerTag, Envelope envelope, BasicProperties properties, byte[] body) throws IOException { String decodedBody; try { decodedBody = new String(body, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); // can't happen } JSONObject jo = new JSONObject(decodedBody); if ("GET".equals(jo.getString("method"))) { CrawlURI curi; try { curi = makeCrawlUri(jo); // bypasses scoping (unless rechecking is configured) getFrontier().schedule(curi); if (logger.isLoggable(Level.FINE)) { logger.fine("scheduled " + curi); } } catch (URIException e) { logger.log(Level.WARNING, "problem creating CrawlURI from json received via AMQP " + decodedBody, e); } catch (JSONException e) { logger.log(Level.SEVERE, "problem creating CrawlURI from json received via AMQP " + decodedBody, e); } } else { logger.warning("ignoring url with method other than GET - " + decodedBody); } this.getChannel().basicAck(envelope.getDeliveryTag(), false); } @Override public void handleShutdownSignal(String consumerTag, ShutdownSignalException sig) { if (!sig.isInitiatedByApplication()) { logger.log(Level.SEVERE, "amqp channel/connection unexpectedly shut down consumerTag=" + consumerTag, sig); } else { logger.info("amqp channel/connection shut down consumerTag=" + consumerTag); } isRunning = false; } // { // "headers": { // "Referer": "https://archive.org/", // "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu 
Chromium/32.0.1700.102 Chrome/32.0.1700.102 Safari/537.36", // "Accept": "image/webp,*/*;q=0.8" // }, // "url": "https://analytics.archive.org/0.gif?server_ms=256&server_name=www19.us.archive.org&service=ao&loadtime=358&timediff=-8&locale=en-US&referrer=-&version=2&count=9", // "method": "GET" // } @SuppressWarnings("unchecked") protected CrawlURI makeCrawlUri(JSONObject jo) throws URIException, JSONException { JSONObject joHeaders = jo.getJSONObject("headers"); UURI uuri = UURIFactory.getInstance(jo.getString("url")); UURI via = UURIFactory.getInstance(jo.getString("parentUrl")); JSONObject parentUrlMetadata = jo.getJSONObject("parentUrlMetadata"); String parentHopPath = parentUrlMetadata.getString("pathFromSeed"); String hopPath = parentHopPath + Hop.INFERRED.getHopString(); CrawlURI curi = new CrawlURI(uuri, hopPath, via, LinkContext.INFERRED_MISC); // set the heritable data from the parent url, passed back to us via amqp // XXX brittle, only goes one level deep, and only handles strings and arrays, the latter of which it converts to a Set. 
// 'heritableData': {'source': 'https://facebook.com/whitehouse/', 'heritable': ['source', 'heritable']} JSONObject heritableData = parentUrlMetadata.getJSONObject("heritableData"); for (String key : (Set<String>) heritableData.keySet()) { Object value = heritableData.get(key); if (value instanceof JSONArray) { Set<String> valueSet = new HashSet<String>(); JSONArray arr = ((JSONArray) value); for (int i = 0; i < arr.length(); i++) { valueSet.add(arr.getString(i)); } curi.getData().put(key, valueSet); } else { curi.getData().put(key, heritableData.get(key)); } } // set the http headers from the amqp message Map<String, String> customHttpRequestHeaders = new HashMap<String, String>(); for (Object key : joHeaders.keySet()) { customHttpRequestHeaders.put(key.toString(), joHeaders.getString(key.toString())); } curi.getData().put("customHttpRequestHeaders", customHttpRequestHeaders); /* Use HighestUriQueuePrecedencePolicy to ensure these high priority * urls really get crawled ahead of others. * See https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes */ curi.setSchedulingDirective(SchedulingConstants.HIGH); curi.setPrecedence(1); //curi.setForceFetch(true); curi.getAnnotations().add(A_RECEIVED_FROM_AMQP); return curi; } } @Override public void onApplicationEvent(CrawlStateEvent event) { switch (event.getState()) { case PAUSING: case PAUSED: if (channel != null && channel.isOpen()) { try { channel.flow(false); } catch (IOException e) { logger.log(Level.WARNING, "failed to pause flow on amqp channel", e); } } break; case RUNNING: case EMPTY: case PREPARING: if (channel != null && channel.isOpen()) { try { channel.flow(true); } catch (IOException e) { logger.log(Level.SEVERE, "failed to resume flow on amqp channel", e); } } break; default: } } }