// Java tutorial
/* * Copyright 2014 steve(at)threadswarm.com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.threadswarm.imagefeedarchiver.driver; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.concurrent.CompletionService; import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorCompletionService; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpHeaders; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.message.BasicHeader; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.context.ApplicationContext; import 
org.springframework.context.ConfigurableApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; import com.threadswarm.imagefeedarchiver.FeedUtils; import com.threadswarm.imagefeedarchiver.dao.ProcessedRssItemDAO; import com.threadswarm.imagefeedarchiver.filter.ChainedRssItemFilter; import com.threadswarm.imagefeedarchiver.filter.PreviouslyDownloadedItemFilter; import com.threadswarm.imagefeedarchiver.filter.RssItemFilter; import com.threadswarm.imagefeedarchiver.model.ProcessedRssItem; import com.threadswarm.imagefeedarchiver.model.RssChannel; import com.threadswarm.imagefeedarchiver.model.RssItem; import com.threadswarm.imagefeedarchiver.parser.FeedParserException; import com.threadswarm.imagefeedarchiver.parser.RssDOMFeedParser; import com.threadswarm.imagefeedarchiver.processor.RssItemProcessor; public class CommandLineDriver implements Runnable { private final static Header DNT_HEADER = new BasicHeader("DNT", "1"); private final static Header RSS_ACCEPT_HEADER = new BasicHeader("Accept", "application/rss+xml, application/xml, text/xml"); private final static Logger LOGGER = LoggerFactory.getLogger(CommandLineDriver.class); // configuration parameters private final URI rssFeedUri; private final File outputDirectory; private final int threadCount; private final long downloadDelay; private final boolean doNotTrackRequested; private final boolean forceHttps; // components private final HttpClient httpClient; private final ProcessedRssItemDAO processedRssItemDAO; private CommandLineDriver(Builder builder) { this.rssFeedUri = builder.rssFeedUri; this.outputDirectory = builder.outputDirectory; this.threadCount = builder.threadCount; this.downloadDelay = builder.downloadDelay; this.doNotTrackRequested = builder.doNotTrackRequested; this.forceHttps = builder.forceHttps; this.httpClient = builder.httpClient; this.processedRssItemDAO = builder.processedRssItemDAO; } public static void main(String[] args) throws InterruptedException, 
ExecutionException, ParseException { // define available command-line options Options options = new Options(); options.addOption("h", "help", false, "display usage information"); options.addOption("u", "url", true, "RSS feed URL"); options.addOption("a", "user-agent", true, "User-Agent header value to use when making HTTP requests"); options.addOption("o", "output-directory", true, "output directory for downloaded images"); options.addOption("t", "thread-count", true, "number of worker threads, defaults to cpu-count + 1"); options.addOption("d", "delay", true, "delay between image downloads (in milliseconds)"); options.addOption("p", "notrack", false, "tell websites that you don't wish to be tracked (DNT)"); options.addOption("s", "https", false, "Rewrite image URLs to leverage SSL/TLS"); CommandLineParser commandLineParser = new BasicParser(); CommandLine commandLine = commandLineParser.parse(options, args); // print usage information if 'h'/'help' or no-args were given if (args.length == 0 || commandLine.hasOption("h")) { HelpFormatter helpFormatter = new HelpFormatter(); helpFormatter.printHelp("java -jar ImageFeedArchiver.jar", options); return; //abort execution } URI rssFeedUri = null; if (commandLine.hasOption("u")) { String rssFeedUrlString = commandLine.getOptionValue("u"); try { rssFeedUri = FeedUtils.getUriFromUrlString(rssFeedUrlString); } catch (MalformedURLException | URISyntaxException e) { LOGGER.error("The Feed URL you supplied was malformed or violated syntax rules.. 
exiting", e); System.exit(1); } LOGGER.info("Target RSS feed URL: {}", rssFeedUri); } else { throw new IllegalStateException("RSS feed URL was not specified!"); } File outputDirectory = null; if (commandLine.hasOption("o")) { outputDirectory = new File(commandLine.getOptionValue("o")); if (!outputDirectory.isDirectory()) throw new IllegalArgumentException("output directory must be a *directory*!"); LOGGER.info("Using output directory: '{}'", outputDirectory); } else { throw new IllegalStateException("output directory was not specified!"); } String userAgentString = null; if (commandLine.hasOption("a")) { userAgentString = commandLine.getOptionValue("a"); LOGGER.info("Setting 'User-Agent' header value to '{}'", userAgentString); } int threadCount; if (commandLine.hasOption("t")) { threadCount = Integer.parseInt(commandLine.getOptionValue("t")); } else { threadCount = Runtime.getRuntime().availableProcessors() + 1; } LOGGER.info("Using {} worker threads", threadCount); long downloadDelay = 0; if (commandLine.hasOption("d")) { String downloadDelayString = commandLine.getOptionValue("d"); downloadDelay = Long.parseLong(downloadDelayString); } LOGGER.info("Using a download-delay of {} milliseconds", downloadDelay); boolean doNotTrackRequested = commandLine.hasOption("p"); boolean forceHttps = commandLine.hasOption("s"); ApplicationContext context = new ClassPathXmlApplicationContext("META-INF/applicationContext.xml"); ((ConfigurableApplicationContext) context).registerShutdownHook(); HttpClient httpClient = (HttpClient) context.getBean("httpClient"); if (userAgentString != null) httpClient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgentString); ProcessedRssItemDAO processedRssItemDAO = (ProcessedRssItemDAO) context.getBean("processedRssItemDAO"); CommandLineDriver.Builder driverBuilder = new CommandLineDriver.Builder(rssFeedUri); driverBuilder.setDoNotTrackRequested(doNotTrackRequested).setOutputDirectory(outputDirectory) 
.setDownloadDelay(downloadDelay).setThreadCount(threadCount).setHttpClient(httpClient) .setForceHttps(forceHttps).setProcessedRssItemDAO(processedRssItemDAO); CommandLineDriver driver = driverBuilder.build(); driver.run(); } private RssChannel fetchRssChannel(URI targetUri) throws IOException, FeedParserException { RssChannel rssChannel = null; HttpEntity responseEntity = null; try { LOGGER.info("Attempting to fetch feed from URI: {}", targetUri.toString()); HttpGet rssFeedGet = new HttpGet(targetUri); if (doNotTrackRequested) { LOGGER.debug("Adding 'DNT' header to feed-fetch request"); rssFeedGet.addHeader(DNT_HEADER); } rssFeedGet.addHeader(RSS_ACCEPT_HEADER); HttpResponse imageResponse = httpClient.execute(rssFeedGet); responseEntity = imageResponse.getEntity(); String rssFeedXmlString = EntityUtils.toString(responseEntity); RssDOMFeedParser parser = new RssDOMFeedParser(); rssChannel = parser.readFeed(rssFeedXmlString); } finally { EntityUtils.consumeQuietly(responseEntity); } return rssChannel; } @Override public void run() { //setup filters List<RssItemFilter> filterList = new LinkedList<RssItemFilter>(); filterList.add(new PreviouslyDownloadedItemFilter(processedRssItemDAO)); RssItemFilter chainedItemFilter = new ChainedRssItemFilter(filterList); RssChannel rssChannel = null; try { rssChannel = fetchRssChannel(rssFeedUri); } catch (IOException | FeedParserException e) { LOGGER.error( "An Exception was thrown while attempting to download and parse the target RSS feed.. 
exiting", e); System.exit(1); } List<RssItem> filteredItemList = new LinkedList<RssItem>(); if (rssChannel != null && rssChannel.getItems() != null) { for (RssItem rssItem : rssChannel.getItems()) { rssItem = chainedItemFilter.filter(rssItem); if (rssItem != null) filteredItemList.add(rssItem); } } if (!filteredItemList.isEmpty()) { //create list of headers to be used when downloading images List<Header> headerList = new ArrayList<Header>(2); if (doNotTrackRequested) { LOGGER.debug("Adding 'DNT' header to worker requests"); headerList.add(DNT_HEADER); } headerList.add(new BasicHeader(HttpHeaders.REFERER, rssFeedUri.toString())); headerList = Collections.unmodifiableList(headerList); ExecutorService executorService = null; try { executorService = Executors.newFixedThreadPool(threadCount); CompletionService<ProcessedRssItem> completionService = new ExecutorCompletionService<ProcessedRssItem>( executorService); Set<URI> processedURISet = new ConcurrentSkipListSet<URI>(); int itemCount = 0; for (RssItem rssItem : filteredItemList) { completionService.submit(new RssItemProcessor(httpClient, rssItem, processedRssItemDAO, outputDirectory, headerList, processedURISet, downloadDelay, forceHttps)); itemCount++; } LOGGER.info("{} jobs submitted for execution", itemCount); for (int x = 0; x < itemCount; x++) { ProcessedRssItem processedItem = completionService.take().get(); LOGGER.info("Item status: {} --> [{}]", processedItem.getRssItem().getTitle(), processedItem.getDownloadStatus()); } } catch (InterruptedException e) { LOGGER.warn("Thread interrupted while blocking", e); Thread.currentThread().interrupt(); // restore interrupt } catch (ExecutionException e) { LOGGER.error("An Exception was thrown during worker execution and subsequently propagated", e); e.printStackTrace(); } finally { executorService.shutdown(); try { executorService.awaitTermination(10, TimeUnit.SECONDS); } catch (InterruptedException e) { LOGGER.warn("Thread interrupted while blocking", e); 
Thread.currentThread().interrupt(); // restore interrupt } httpClient.getConnectionManager().shutdown(); } } } /** * Builder used for creating instances of {@link CommandLineDriver}. * <p> * This class implements the "fluent-builder" pattern which allows for * multiple setters to be called in a chain-like manner. * * @author steve(at)threadswarm.com */ public static class Builder { private URI rssFeedUri; private File outputDirectory; private int threadCount; private long downloadDelay; private boolean doNotTrackRequested; private boolean forceHttps; private HttpClient httpClient; private ProcessedRssItemDAO processedRssItemDAO; /** * Default no-arg constructor */ public Builder() { } /** * Constructor which accepts a {@code URI} as an argument. * <p> * This constructor can be used in lieu of the {@code setRssFeedUri(URI)} * method. That being said, a call to the aforementioned method * will result in the value provided to this constructor being * overwritten. * * @param rssFeedUri */ public Builder(URI rssFeedUri) { this.rssFeedUri = rssFeedUri; } /** * Sets the {@code URI} corresponding to the RSS feed to be parsed. * * @param rssFeedUri the URI corresponding to the RSS feed. * @return the Builder upon which the method call was invoked */ public Builder setRssFeedUri(URI rssFeedUri) { this.rssFeedUri = rssFeedUri; return this; } /** * Sets the directory in which downloaded images should be written. * * @param outputDirectory the directory to be used for storing downloaded images * @return the Builder upon which the method call was invoked */ public Builder setOutputDirectory(File outputDirectory) { this.outputDirectory = outputDirectory; return this; } /** * Sets the number of threads to be used for downloading images. * <p> * This value sets the limit on the number of parallel downloads * from a given feed. Typically you should to set this to a * respectful value of three or less. 
* * @param threadCount the number of worker threads to be used for downloading images * @return the Builder upon which the method call was invoked */ public Builder setThreadCount(int threadCount) { this.threadCount = threadCount; return this; } /** * Sets the delay between image downloads in milliseconds. * <p> * Please note that this value is not global, rather it is on * a per-worker/thread basis. * * @param downloadDelay the delay between image downloads in milliseconds * @return the Builder upon which the method call was invoked */ public Builder setDownloadDelay(long downloadDelay) { this.downloadDelay = downloadDelay; return this; } /** * Sets a boolean flag indicating if the "DO NOT TRACK" header should be used. * <p> * A value of {@code true} indicates that the client wishes that the "DNT" header should * be included with a value of {@code 1}, telling the website that the user does not * wish to be tracked. * * @param doNotTrackRequested * @return the Builder upon which the method call was invoked */ public Builder setDoNotTrackRequested(boolean doNotTrackRequested) { this.doNotTrackRequested = doNotTrackRequested; return this; } /** * Sets a boolean flag indicating if "http://" URLs should be rewritten to use "https://". * * @param forceHttps * @return the Builder upon which the method call was invoked */ public Builder setForceHttps(boolean forceHttps) { this.forceHttps = forceHttps; return this; } /** * Sets the {@link HttpClient} to be used when making all HTTP requests. * * @param httpClient the HttClient instance to be used for all HTTP requests. * @return the Builder upon which the method call was invoked */ public Builder setHttpClient(HttpClient httpClient) { this.httpClient = httpClient; return this; } /** * Sets the {@link ProcessedRssItemDAO} instance to be used during operations. 
* * @param processedRssItemDAO * @return the Builder upon which the method call was invoked */ public Builder setProcessedRssItemDAO(ProcessedRssItemDAO processedRssItemDAO) { this.processedRssItemDAO = processedRssItemDAO; return this; } /** * Returns a configured instance of {@link CommandLineDriver}. * * @return an instance of CommandLineDriver that has been configured using this Builder */ public CommandLineDriver build() { return new CommandLineDriver(this); } } }