Java tutorial
/** * Copyright (C) 2013 Seajas, the Netherlands. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License version 3, as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.seajas.search.contender.service.modifier; import com.seajas.search.bridge.profiler.model.modifier.Modifier; import com.seajas.search.bridge.profiler.model.modifier.ModifierFilter; import com.seajas.search.bridge.profiler.model.modifier.ModifierScript; import com.seajas.search.contender.WebResolverSettings; import com.seajas.search.contender.http.SizeRestrictedHttpResponse; import com.seajas.search.contender.http.SizeRestrictedResponseHandler; import com.seajas.search.contender.replication.ModifierCache; import com.seajas.search.contender.scripting.XmlHtmlReader; import com.sun.syndication.feed.synd.SyndEntry; import com.sun.syndication.feed.synd.SyndFeed; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.Collection; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import org.apache.http.Header; import org.apache.http.HttpHost; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.message.BasicHeader; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.ExecutionContext; import org.apache.http.protocol.HttpContext; import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.util.StringUtils; import javax.script.ScriptException; /** * Modifier service for source elements. * * @author Jasper van Veghel <jasper@seajas.com> */ @Service public class SourceElementModifierService extends AbstractModifierService { /** * The logger. */ private static final Logger logger = LoggerFactory.getLogger(SourceElementModifierService.class); /** * The retrieval HTTP client. */ @Autowired @Qualifier("retrievalHttpClient") private HttpClient httpClient; /** * The modifier filter processor. */ @Autowired private ModifierFilterProcessor modifierFilterProcessor; /** * The modifier script processor. */ @Autowired private ModifierScriptProcessor modifierScriptProcessor; /** * The modifier cache. */ @Autowired private ModifierCache modifierCache; /** * The auto-detect parser. */ @Autowired private AutoDetectParser autoDetectParser; /** * Default constructor. */ public SourceElementModifierService() { super(); } /** * Default constructor. * * @param maximumContentLength * @param preferredEnclosures */ @Autowired public SourceElementModifierService( @Value("${contender.project.http.maximum.result.size}") final Long maximumContentLength, @Value("${contender.project.rss.reader.preferred.enclosures}") final String preferredEnclosures) { super(maximumContentLength, preferredEnclosures); } /** * Return a relevant result URI from the given feed. * * @param uri * @param feed * @return URL */ public URI getResultUri(final URI uri, final SyndFeed feed) { logger.info("Retrieving result URI from modifier feed URI " + uri); // Return the first URL to test against if (feed != null && feed.getEntries().size() > 0) for (SyndEntry entry : (Collection<SyndEntry>) feed.getEntries()) try { String protocol = new URL(entry.getLink()).getProtocol(); if (uri.getScheme().equalsIgnoreCase("file") && protocol.equalsIgnoreCase("file") || protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) try { return new URI(getEntryLink(entry)); } catch (URISyntaxException e) { logger.error("Unable to resolve URI", e); } } catch (MalformedURLException e) { logger.error("The given URL to test with is invalid", e); } return null; } /** * Test a given result modifier chain. * * @param modifierId * @param uri * @param encodingOverride * @param userAgent * @throws Exception * @return Map<String, Boolean> */ public Map<String, Boolean> testModifier(Integer modifierId, URI uri, String encodingOverride, String userAgent) throws Exception { WebResolverSettings settings = new WebResolverSettings(); settings.setMaximumContentLength(maximumContentLength); settings.setUserAgent(userAgent); Map<String, Boolean> result = new HashMap<String, Boolean>(); logger.info("Testing result modifier with ID " + modifierId + " and URI " + uri); Modifier modifier = modifierCache.getResultModifierById(modifierId); // Retrieve the content Content contentResult = getContent(uri, encodingOverride, null, userAgent); // Verify the expression in the same way the finalUrl is contrasted to the initial URL if (contentResult == null) throw new Exception("The given modifier content retrieval for testing purposes failed"); if (contentResult.getUri() != null && !contentResult.getUri().equals(uri)) { if (logger.isInfoEnabled()) logger.info("Modifier result has different final (post-redirect) URL from original: " + contentResult.getUri()); } if (!Pattern.matches(modifier.getUrlExpression(), (contentResult.getUri() != null ? contentResult.getUri() : uri).toString())) throw new Exception("The given testing result URL is not covered by the modifier expression"); InputStream inputStream = contentResult.getInputStream(); try { if (contentResult.getMediaType().startsWith("text/") || contentResult.getMediaType().contains("/xhtml") || contentResult.getMediaType().contains("/xml")) { Reader reader; if (encodingOverride != null) reader = new InputStreamReader(inputStream, encodingOverride); else reader = new XmlHtmlReader(inputStream, contentResult.getContentType(), true); try { for (ModifierFilter filter : modifier.getFilters()) { StringBuffer current = new StringBuffer(), updated = new StringBuffer(); reader = readerToBuffer(current, reader, false); reader = readerToBuffer(updated, modifierFilterProcessor.process(filter, reader), false); result.put("Filter_" + filter.getId(), !current.toString().equals(updated.toString())); } for (ModifierScript script : modifier.getScripts()) { StringBuffer current = new StringBuffer(), updated = new StringBuffer(); reader = readerToBuffer(current, reader, false); reader = readerToBuffer(updated, modifierScriptProcessor.process(script, extractAndClose(reader), contentResult.getUri() != null ? contentResult.getUri() : uri, settings, false), false); result.put("Script_" + script.getId(), !current.toString().equals(updated.toString())); } } catch (IOException e) { logger.error("Could not test the given result: " + e.getMessage(), e); } catch (ScriptException e) { logger.error("Could not test the given result: " + e.getMessage(), e); } reader.close(); } else throw new Exception("The given test feed URL does not contain appropriate content for testing (" + contentResult.getMediaType().toString() + ") - must be text or XML-based"); } catch (IOException e) { logger.error("Could not test the given result: " + e.getMessage(), e); } return result; } /** * Retrieve the result content for the given URI. * * @param encodingOverride * @param resultHeaders * @param userAgent * @return Content */ public Content getContent(final URI resultUri, final String encodingOverride, final Map<String, String> resultHeaders, final String userAgent) { URI uriAfterRedirects = null; // Retrieve the content Header contentType = null; try { InputStream inputStream; // Local file streams can only be read if the parent scheme is also local if (!resultUri.getScheme().equalsIgnoreCase("file")) { HttpGet method = new HttpGet(resultUri); if (resultHeaders != null) for (Entry<String, String> resultHeader : resultHeaders.entrySet()) method.setHeader(new BasicHeader(resultHeader.getKey(), resultHeader.getValue())); if (userAgent != null) method.setHeader(CoreProtocolPNames.USER_AGENT, userAgent); HttpContext context = new BasicHttpContext(); SizeRestrictedHttpResponse response = httpClient.execute(method, new SizeRestrictedResponseHandler(maximumContentLength, resultUri), context); if (response != null) { HttpUriRequest currentRequest = (HttpUriRequest) context .getAttribute(ExecutionContext.HTTP_REQUEST); HttpHost currentHost = (HttpHost) context.getAttribute(ExecutionContext.HTTP_TARGET_HOST); try { uriAfterRedirects = new URI(currentHost.toURI()).resolve(currentRequest.getURI()); } catch (URISyntaxException e) { logger.error(String.format("Final URI '%s' is mysteriously invalid", currentHost.toURI()), e); } inputStream = new ByteArrayInputStream(response.getResponse()); contentType = response.getContentType(); } else return null; } else inputStream = new FileInputStream(resultUri.getPath()); // Convert the stream to a reset-able one ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); IOUtils.copy(inputStream, outputStream); inputStream.close(); inputStream = new ByteArrayInputStream(outputStream.toByteArray()); outputStream.close(); // Now determine the content type and create a reader in case of structured content Metadata metadata = new Metadata(); if (encodingOverride != null && contentType != null && StringUtils.hasText(contentType.getValue())) { MediaType type = MediaType.parse(contentType.getValue()); metadata.add(HttpHeaders.CONTENT_TYPE, type.getType() + "/" + type.getSubtype() + "; charset=" + encodingOverride); } else if (contentType != null && StringUtils.hasText(contentType.getValue())) metadata.add(HttpHeaders.CONTENT_TYPE, contentType.getValue()); else if (encodingOverride != null) metadata.add(HttpHeaders.CONTENT_ENCODING, encodingOverride); MediaType mediaType = autoDetectParser.getDetector().detect(inputStream, metadata); return new Content(new ByteArrayInputStream(outputStream.toByteArray()), mediaType.getBaseType() + "/" + mediaType.getSubtype(), contentType != null ? contentType.getValue() : null, uriAfterRedirects != null ? uriAfterRedirects : resultUri); } catch (IOException e) { logger.error("Could not retrieve the given URL", e); return null; } } /** * Apply the given modifiers, passing in the relevant class-bound arguments. * * @param reader * @param uri * @param userAgent * @param resultParameters * @param resultHeaders * @return Reader * @throws IOException * @throws ScriptException */ public Reader applyModifiers(Reader reader, URI uri, String userAgent, Map<String, String> resultParameters, Map<String, String> resultHeaders) throws IOException, ScriptException { WebResolverSettings settings = new WebResolverSettings(); settings.setMaximumContentLength(maximumContentLength); settings.setUserAgent(userAgent); settings.setResultParameters(resultParameters); settings.setResultHeaders(resultHeaders); Reader result = reader; for (Modifier modifier : modifierCache.getResultModifiersByUrlMatch(uri.toString())) { for (ModifierFilter filter : modifier.getFilters()) result = modifierFilterProcessor.process(filter, result); for (ModifierScript script : modifier.getScripts()) result = modifierScriptProcessor.process(script, extractAndClose(result), uri, settings, true); } return result; } /** * Extract a String and close quietly. * * @param reader * @return String * @throws IOException */ private String extractAndClose(final Reader reader) throws IOException { try { return IOUtils.toString(reader); } finally { IOUtils.closeQuietly(reader); } } /** * Intermediate content model. * * @author Jasper van Veghel <jasper@seajas.com> */ public static class Content { /** * The content's input as a stream. */ private InputStream inputStream; /** * The media type (sans parameters). */ private String mediaType; /** * The "Content-Type" header. */ private String contentType; /** * The URI after redirect processing. */ private URI uri; /** * Default constructor. * * @param inputStream * @param mediaType * @param contentType * @param uri */ public Content(final InputStream inputStream, final String mediaType, final String contentType, final URI uri) { this.inputStream = inputStream; this.mediaType = mediaType; this.contentType = contentType; this.uri = uri; } /** * Retrieve the inputStream. * * @return InputStream */ public InputStream getInputStream() { return inputStream; } /** * Retrieve the mediaType. * * @return String */ public String getMediaType() { return mediaType; } /** * Retrieve the contentType. * * @return String */ public String getContentType() { return contentType; } /** * Retrieve the uri. * * @return URI */ public URI getUri() { return uri; } } }