Java tutorial
/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.modifier;

import com.seajas.search.bridge.profiler.model.modifier.Modifier;
import com.seajas.search.bridge.profiler.model.modifier.ModifierFilter;
import com.seajas.search.bridge.profiler.model.modifier.ModifierScript;
import com.seajas.search.contender.WebResolverSettings;
import com.seajas.search.contender.http.HttpClientFeedFetcher;
import com.seajas.search.contender.http.SizeRestrictedHttpResponse;
import com.seajas.search.contender.http.SizeRestrictedResponseHandler;
import com.seajas.search.contender.replication.ModifierCache;
import com.seajas.search.contender.scripting.XmlHtmlReader;
import com.seajas.search.contender.service.builder.RSSDirectoryBuilder;
import com.seajas.search.utilities.web.WebFeeds;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.commons.io.IOUtils;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPSClient;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.CoreProtocolPNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.rometools.fetcher.FeedFetcher;
import org.rometools.fetcher.FetcherException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

import javax.script.ScriptException;

/**
 * Feed modifier service.
 *
 * <p>Retrieves feed content from FTP(S), file and HTTP(S) URIs, unpacks GZIP and single-entry ZIP payloads, and
 * runs the content through the feed modifiers (filters and scripts) that match the feed URI.
 *
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class FeedModifierService extends AbstractModifierService {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(FeedModifierService.class);

    /**
     * Name of the HTTP request header carrying the user agent.
     */
    private static final String USER_AGENT_HEADER = "User-Agent";

    /**
     * The retrieval HTTP client.
     */
    @Autowired
    @Qualifier("retrievalHttpClient")
    private HttpClient httpClient;

    /**
     * The feed fetcher.
     */
    @Autowired
    @Qualifier("retrievalFeedFetcher")
    private FeedFetcher feedFetcher;

    /**
     * The modifier filter processor.
     */
    @Autowired
    private ModifierFilterProcessor modifierFilterProcessor;

    /**
     * The modifier script processor.
     */
    @Autowired
    private ModifierScriptProcessor modifierScriptProcessor;

    /**
     * The modifier cache.
     */
    @Autowired
    private ModifierCache modifierCache;

    /**
     * The auto-detect parser.
     */
    @Autowired
    private AutoDetectParser autoDetectParser;

    /**
     * Default constructor.
     */
    public FeedModifierService() {
        super();
    }

    /**
     * Property-driven constructor; both values are handed to the parent service unchanged.
     *
     * @param maximumContentLength value of {@code contender.project.http.maximum.result.size}
     * @param preferredEnclosures value of {@code contender.project.rss.reader.preferred.enclosures}
     */
    @Autowired
    public FeedModifierService(
            @Value("${contender.project.http.maximum.result.size}") final Long maximumContentLength,
            @Value("${contender.project.rss.reader.preferred.enclosures}") final String preferredEnclosures) {
        super(maximumContentLength, preferredEnclosures);
    }

    /**
     * Test a feed by retrieving the content and then discarding it.
     *
     * @param uri the feed location
     * @param encodingOverride character encoding to force, or {@code null} to have it detected
     * @param userAgent user agent to send on HTTP(S) requests, may be {@code null}
     * @return {@code true} when content could be retrieved and its reader closed, {@code false} otherwise
     */
    public boolean testConnection(final URI uri, final String encodingOverride, final String userAgent) {
        logger.info("Retrieving testing content for feed with URI " + uri);

        Reader result = getContent(uri, encodingOverride, userAgent, null);

        if (result == null) {
            logger.error("Could not retrieve testing content for feed with URI " + uri);

            return false;
        }

        // The content itself is of no interest here - only whether it could be retrieved
        try {
            result.close();

            return true;
        } catch (IOException e) {
            logger.error("Could not retrieve testing content for feed with URI " + uri, e);

            return false;
        }
    }

    /**
     * Test a given feed modifier chain by its (feed) modifier ID.
     *
     * <p>Every filter and script of the modifier is applied in turn; the result map records, under the keys
     * {@code Filter_<id>} and {@code Script_<id>}, whether that step changed the content.
     *
     * @param id the feed modifier ID
     * @param uri the feed location to test against
     * @param encodingOverride character encoding to force, or {@code null} to have it detected
     * @param userAgent user agent to send on HTTP(S) requests, may be {@code null}
     * @throws Exception when the modifier is unknown, the URI is not covered by the modifier's URL expression, no
     *         content could be retrieved, or a filter / script fails
     * @return Map<String, Boolean>
     */
    public Map<String, Boolean> testModifier(Integer id, URI uri, String encodingOverride, String userAgent) throws Exception {
        WebResolverSettings settings = new WebResolverSettings();

        settings.setMaximumContentLength(maximumContentLength);
        settings.setUserAgent(userAgent);

        Map<String, Boolean> result = new HashMap<String, Boolean>();

        logger.info("Testing feed modifier with ID " + id + " and URI " + uri);

        Reader reader = null;

        try {
            Modifier modifier = modifierCache.getFeedModifierById(id);

            if (modifier == null) {
                throw new Exception("No feed modifier could be found for ID " + id);
            }

            if (!Pattern.matches(modifier.getUrlExpression(), uri.toString())) {
                throw new Exception("The given testing feed URI is not covered by the modifier expression");
            }

            reader = getContent(uri, encodingOverride, userAgent, null);

            if (reader == null) {
                throw new Exception("Could not retrieve the result feed content");
            }

            // Run it through the modifier. The reader is deliberately kept open between steps, because each
            // step consumes the output of the previous one; it is closed once in the finally block below.
            for (ModifierFilter filter : modifier.getFilters()) {
                StringBuffer current = new StringBuffer();
                StringBuffer updated = new StringBuffer();

                reader = readerToBuffer(current, reader, false);
                reader = modifierFilterProcessor.process(filter, reader);
                reader = readerToBuffer(updated, reader, false);

                result.put("Filter_" + filter.getId(), !current.toString().equals(updated.toString()));
            }

            for (ModifierScript script : modifier.getScripts()) {
                StringBuffer current = new StringBuffer();
                StringBuffer updated = new StringBuffer();

                reader = readerToBuffer(current, reader, false);
                reader = modifierScriptProcessor.process(script, extractAndClose(reader), uri, settings, false);
                reader = readerToBuffer(updated, reader, false);

                result.put("Script_" + script.getId(), !current.toString().equals(updated.toString()));
            }
        } catch (ScriptException e) {
            throw new Exception("Could not test the given feed: " + e.getMessage(), e);
        } catch (IOException e) {
            throw new Exception("Could not test the given feed: " + e.getMessage(), e);
        } finally {
            IOUtils.closeQuietly(reader);
        }

        return result;
    }

    /**
     * Retrieve a feed from the URL modified by the relevant modifiers.
     *
     * <p>HTTP(S) feeds without any matching modifier are retrieved through the feed fetcher; everything else is
     * retrieved as raw content, run through the matching modifiers and then parsed.
     *
     * @param uri the feed location
     * @param encodingOverride character encoding to force, or {@code null} to have it detected
     * @param userAgent user agent to send on HTTP(S) requests, may be {@code null}
     * @param resultParameters result parameters, stored on the settings passed to the modifier scripts
     * @param resultHeaders extra request headers, also stored on the settings passed to the modifier scripts
     * @param suppressErrors when {@code true}, retrieval failures caught here are not logged; {@code null} is
     *        treated as {@code false}
     * @return SyndFeed, or {@code null} when the feed could not be retrieved
     */
    public SyndFeed getFeed(URI uri, String encodingOverride, String userAgent, Map<String, String> resultParameters,
            Map<String, String> resultHeaders, Boolean suppressErrors) {
        WebResolverSettings settings = new WebResolverSettings();

        settings.setMaximumContentLength(maximumContentLength);
        settings.setUserAgent(userAgent);
        settings.setResultParameters(resultParameters);
        settings.setResultHeaders(resultHeaders);

        // Evaluate the flag once, null-safely: unboxing a null Boolean inside a catch block would replace the
        // real failure with a NullPointerException.
        final boolean logErrors = !Boolean.TRUE.equals(suppressErrors);

        try {
            SyndFeed resultFeed = null;

            // Literal-first comparison keeps scheme-less (relative) URIs from causing a NullPointerException
            final String scheme = uri.getScheme();
            final boolean httpScheme = "http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme);

            // We can only retrieve unmodified feeds using conditional gets
            List<Modifier> modifiers = modifierCache.getFeedModifiersByUrlMatch(uri.toString());

            if (modifiers.isEmpty() && httpScheme) {
                if (feedFetcher instanceof HttpClientFeedFetcher) {
                    resultFeed = ((HttpClientFeedFetcher) feedFetcher).retrieveFeed(userAgent, uri.toURL(), resultHeaders);
                } else {
                    resultFeed = feedFetcher.retrieveFeed(userAgent, uri.toURL());
                }

                WebFeeds.validate(resultFeed, uri);
            } else {
                Reader reader = getContent(uri, encodingOverride, userAgent, resultHeaders);

                if (reader == null) {
                    logger.error("No content could be retrieved from the given URL. Skipping feed.");

                    return null;
                }

                try {
                    // Run it through the modifiers
                    reader = executeModifiers(modifiers, reader, uri, settings);

                    // Fill in the result feed
                    SyndFeedInput feedInput = new SyndFeedInput();

                    resultFeed = feedInput.build(reader);
                } finally {
                    reader.close();
                }
            }

            return resultFeed;
        } catch (FetcherException e) {
            // Fetcher failures are logged without a stack trace
            if (logErrors) {
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage());
            }
        } catch (IllegalArgumentException e) {
            if (logErrors) {
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
            }
        } catch (FeedException e) {
            if (logErrors) {
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
            }
        } catch (ScriptException e) {
            if (logErrors) {
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
            }
        } catch (IOException e) {
            if (logErrors) {
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
            }
        }

        return null;
    }

    /**
     * Retrieve the content of a result feed URL.
     *
     * <p>Supports the ftp, ftps, file, http and https schemes. GZIP payloads are decompressed and ZIP payloads
     * holding exactly one file entry are unpacked before the content is buffered into the returned reader.
     *
     * @param uri the feed location
     * @param encodingOverride character encoding to force, or {@code null} to have it detected
     * @param userAgent user agent to send on HTTP(S) requests, may be {@code null}
     * @param resultHeaders extra request headers for HTTP(S) requests, may be {@code null}
     * @return Reader over the buffered content, or {@code null} when it could not be retrieved
     */
    private Reader getContent(final URI uri, final String encodingOverride, final String userAgent,
            final Map<String, String> resultHeaders) {
        Reader result = null;
        String contentType = null;

        // Literal-first comparisons below keep scheme-less URIs from causing a NullPointerException
        final String scheme = uri.getScheme();

        // Retrieve the feed

        try {
            InputStream inputStream = null;

            if ("ftp".equalsIgnoreCase(scheme) || "ftps".equalsIgnoreCase(scheme)) {
                inputStream = retrieveFtpContent(uri);
            } else if ("file".equalsIgnoreCase(scheme)) {
                File file = new File(uri);

                if (!file.isDirectory()) {
                    inputStream = new FileInputStream(file);
                } else {
                    inputStream = RSSDirectoryBuilder.build(file);
                }
            } else if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
                try {
                    HttpGet method = new HttpGet(uri.toString());

                    if (resultHeaders != null) {
                        for (Entry<String, String> resultHeader : resultHeaders.entrySet()) {
                            method.setHeader(new BasicHeader(resultHeader.getKey(), resultHeader.getValue()));
                        }
                    }

                    // CoreProtocolPNames.USER_AGENT is the HttpParams key "http.useragent", not a header name;
                    // using it with setHeader() never sent the user agent as an actual User-Agent header.
                    if (userAgent != null) {
                        method.setHeader(USER_AGENT_HEADER, userAgent);
                    }

                    SizeRestrictedHttpResponse response = httpClient.execute(method,
                            new SizeRestrictedResponseHandler(maximumContentLength, uri));

                    try {
                        if (response == null) {
                            return null;
                        }

                        inputStream = new ByteArrayInputStream(response.getResponse());
                        contentType = response.getContentType() != null ? response.getContentType().getValue() : null;
                    } catch (RuntimeException e) {
                        method.abort();

                        throw e;
                    }
                } catch (IllegalArgumentException e) {
                    logger.error("Invalid URL " + uri.toString() + " - not returning content", e);

                    return null;
                }
            } else {
                logger.error("Unknown protocol " + scheme + ". Skipping feed.");

                return null;
            }

            if (inputStream == null) {
                logger.error("No content stream could be opened for the given feed. Skipping feed.");

                return null;
            }

            // Guess the character encoding using ROME's reader, then buffer it so we can discard the input stream (and close the connection)

            InputStream readerInputStream = new BufferedInputStream(inputStream);

            try {
                // Detection failures are I/O errors and are reported by the outer handler; running the
                // detection inside this block guarantees the stream is closed in that case too.
                MediaType mediaType = autoDetectParser.getDetector().detect(readerInputStream, new Metadata());

                try {
                    Reader reader = null;

                    if (mediaType.getType().equals("application")) {
                        if (mediaType.getSubtype().equals("x-gzip")) {
                            GZIPInputStream gzipInputStream = new GZIPInputStream(readerInputStream);

                            try {
                                if (encodingOverride != null) {
                                    reader = readerToBuffer(new StringBuffer(),
                                            new InputStreamReader(gzipInputStream, encodingOverride), false);
                                } else {
                                    reader = readerToBuffer(new StringBuffer(),
                                            contentType != null
                                                    ? new XmlHtmlReader(gzipInputStream, contentType, true)
                                                    : new XmlReader(gzipInputStream, true),
                                            false);
                                }
                            } finally {
                                IOUtils.closeQuietly(gzipInputStream);
                            }
                        } else if (mediaType.getSubtype().equals("zip")) {
                            ZipFile zipFile = null;

                            // ZipInputStream can't do read-aheads, so we have to use a temporary on-disk file instead

                            File temporaryFile = File.createTempFile("profiler-", ".zip");

                            try {
                                FileOutputStream zipOutputStream = new FileOutputStream(temporaryFile);

                                try {
                                    IOUtils.copy(readerInputStream, zipOutputStream);

                                    zipOutputStream.flush();
                                } finally {
                                    zipOutputStream.close();
                                }

                                readerInputStream.close();

                                // Create a new entry and process it

                                zipFile = new ZipFile(temporaryFile);

                                Enumeration<? extends ZipEntry> zipEnumeration = zipFile.entries();

                                // An empty archive has no first element; map that onto the regular error below
                                ZipEntry zipEntry = zipEnumeration.hasMoreElements() ? zipEnumeration.nextElement() : null;

                                if (zipEntry == null || zipEntry.isDirectory() || zipEnumeration.hasMoreElements()) {
                                    logger.error(
                                            "ZIP files are currently expected to contain one and only one entry, which is to be a file");

                                    return null;
                                }

                                // We currently only perform prolog stripping for ZIP files

                                InputStream zipInputStream = new BufferedInputStream(zipFile.getInputStream(zipEntry));

                                if (encodingOverride != null) {
                                    // NOTE(review): this branch assigns 'reader' (buffered once more below) while
                                    // the other assigns 'result' directly - presumably equivalent; kept as found.
                                    reader = readerToBuffer(new StringBuffer(),
                                            new InputStreamReader(zipInputStream, encodingOverride), true);
                                } else {
                                    result = readerToBuffer(new StringBuffer(),
                                            contentType != null
                                                    ? new XmlHtmlReader(zipInputStream, contentType, true)
                                                    : new XmlReader(zipInputStream, true),
                                            true);
                                }
                            } catch (Exception e) {
                                logger.error("An error occurred during ZIP file processing", e);

                                return null;
                            } finally {
                                // Always attempt the delete, even when closing the archive fails
                                try {
                                    if (zipFile != null) {
                                        zipFile.close();
                                    }
                                } finally {
                                    if (!temporaryFile.delete()) {
                                        logger.error("Unable to delete temporary file");
                                    }
                                }
                            }
                        }
                    }

                    if (result == null) {
                        Reader source = reader;

                        // Nothing was unpacked above: read the stream as-is
                        if (source == null) {
                            if (encodingOverride != null) {
                                source = new InputStreamReader(readerInputStream, encodingOverride);
                            } else if (contentType != null) {
                                source = new XmlHtmlReader(readerInputStream, contentType, true);
                            } else {
                                source = new XmlReader(readerInputStream, true);
                            }
                        }

                        result = readerToBuffer(new StringBuffer(), source, false);
                    }
                } catch (Exception e) {
                    logger.error("An error occurred during stream processing", e);

                    return null;
                }
            } finally {
                inputStream.close();
            }
        } catch (IOException e) {
            logger.error("Could not retrieve the given feed: " + e.getMessage(), e);

            return null;
        }

        return result;
    }

    /**
     * Retrieve a file over FTP(S), fully buffered in memory.
     *
     * <p>The content is read before the client disconnects, so the returned stream does not depend on the
     * connection. As before, retrieval is only attempted when the URI carries user info.
     *
     * @param uri the ftp / ftps location, optionally with {@code user[:password]} user info
     * @return InputStream over the buffered file content, or {@code null} when nothing was retrieved
     * @throws IOException when connecting, logging in, transferring or disconnecting fails
     */
    private InputStream retrieveFtpContent(final URI uri) throws IOException {
        FTPClient ftpClient = "ftps".equalsIgnoreCase(uri.getScheme()) ? new FTPSClient() : new FTPClient();

        try {
            ftpClient.connect(uri.getHost(), uri.getPort() != -1 ? uri.getPort() : 21);

            String userInfo = uri.getUserInfo();

            if (!StringUtils.hasText(userInfo)) {
                logger.error("No credentials were given for FTP host " + uri.getHost() + " - not retrieving content");

                return null;
            }

            int separator = userInfo.indexOf(":");

            if (separator != -1) {
                ftpClient.login(userInfo.substring(0, separator), userInfo.substring(separator + 1));
            } else {
                ftpClient.login(userInfo, "");
            }

            InputStream ftpStream = ftpClient.retrieveFileStream(uri.getPath());

            if (ftpStream == null) {
                logger.error("Could not retrieve " + uri.getPath() + " from FTP host " + uri.getHost());

                return null;
            }

            try {
                return new ByteArrayInputStream(IOUtils.toByteArray(ftpStream));
            } finally {
                IOUtils.closeQuietly(ftpStream);
            }
        } finally {
            ftpClient.disconnect();
        }
    }

    /**
     * Execute the given modifiers, passing in the relevant class-bound arguments.
     *
     * <p>For every modifier, all of its filters are applied first, followed by all of its scripts.
     *
     * @param modifiers the modifiers to apply, in order
     * @param reader the content to modify
     * @param uri the feed location, passed on to the scripts
     * @param settings the resolver settings, passed on to the scripts
     * @return Reader
     * @throws IOException
     * @throws ScriptException
     */
    private Reader executeModifiers(List<Modifier> modifiers, Reader reader, URI uri, WebResolverSettings settings)
            throws IOException, ScriptException {
        Reader result = reader;

        for (Modifier modifier : modifiers) {
            for (ModifierFilter filter : modifier.getFilters()) {
                result = modifierFilterProcessor.process(filter, result);
            }

            for (ModifierScript script : modifier.getScripts()) {
                result = modifierScriptProcessor.process(script, extractAndClose(result), uri, settings, true);
            }
        }

        return result;
    }

    /**
     * Extract a String and close quietly.
     *
     * @param reader the reader to drain; it is closed whether or not reading succeeds
     * @return String
     * @throws IOException
     */
    private String extractAndClose(final Reader reader) throws IOException {
        try {
            return IOUtils.toString(reader);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }
}