Java tutorial
/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.modifier;

import com.seajas.search.bridge.jms.model.Archive;
import com.seajas.search.contender.scripting.ResolvingTransformerCache;
import com.seajas.search.contender.service.cache.CacheService;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamResult;

/**
 * Modifier service which retrieves remote FTP archives, decompresses them and transforms
 * their entries into syndication feed content before offering them to a result handler.
 *
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class ArchiveModifierService extends AbstractModifierService {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(ArchiveModifierService.class);

    /**
     * The transformer cache.
     */
    @Autowired
    private ResolvingTransformerCache transformerCache;

    /**
     * The auto-detect parser.
     */
    @Autowired
    private AutoDetectParser autoDetectParser;

    /**
     * The cache service.
     */
    @Autowired
    private CacheService cacheService;

    /**
     * The packages location (not including the specific folder, %1).
     */
    @Value("${contender.project.archives.location}${file.separator}%1${file.separator}packages")
    private String packagesLocation;

    /**
     * The results location (not including the specific folder, %1).
     */
    @Value("${contender.project.archives.location}${file.separator}%1${file.separator}results")
    private String resultsLocation;

    /**
     * Default constructor.
     */
    public ArchiveModifierService() {
        super();
    }

    /**
     * Constructor.
     *
     * @param maximumContentLength
     * @param preferredEnclosures
     */
    @Autowired
    public ArchiveModifierService(@Value("${contender.project.http.maximum.result.size}") final Long maximumContentLength,
            @Value("${contender.project.rss.reader.preferred.enclosures}") final String preferredEnclosures) {
        super(maximumContentLength, preferredEnclosures);
    }

    /**
     * Test an archive connection by logging onto the given URL and retrieving a directory listing.
     *
     * @param uri
     * @return boolean
     */
    public boolean testConnection(final URI uri) {
        if (!uri.getScheme().equalsIgnoreCase("ftp") && !uri.getScheme().equalsIgnoreCase("ftps")) {
            logger.error("Archive URL " + uri + " using protocol '" + uri.getScheme() + "' is not supported");

            return false;
        }

        FTPClient ftpClient = retrieveFtpClient(uri);

        if (ftpClient != null) {
            try {
                logger.info("Retrieving testing content for archive with URI " + uri);

                // Retrieve a file listing
                ftpClient.listFiles(uri.getPath());

                // Disconnect the client
                ftpClient.disconnect();
            } catch (IOException e) {
                logger.error("Could not perform all FTP operations during archive testing", e);

                return false;
            }

            return true;
        } else
            return false;
    }

    /**
     * Handle the given archive's content by sending each entry to the given handler.
     *
     * @param archive
     * @param handler
     * @throws Exception
     */
    public void handleArchive(final Archive archive, final ArchiveResultHandler handler) throws Exception {
        // Create a validating SAX parser
        final SAXParserFactory parserFactory = SAXParserFactory.newInstance();

        parserFactory.setValidating(true);
        parserFactory.setNamespaceAware(true);

        // Create a common transformer per thread
        Transformer transformer = null;

        try {
            transformer = transformerCache.getTransformer(archive.getId(), "archive", archive.getModificationDate());

            if (transformer == null)
                transformer = transformerCache.putContent(archive.getId(), "archive", archive.getModificationDate(), archive.getTransformerContent());
        } catch (TransformerConfigurationException e) {
            logger.error("Unable to generate a (cached) transformer from the given content", e);

            return;
        } catch (TransformerFactoryConfigurationError e) {
            logger.error("Unable to generate a (cached) transformer from the given content", e);

            return;
        }

        // Store and process the files
        try {
            Map<File, String> storedResults = storeAndDecompressFiles(archive);
            List<String> deletedLinks = new ArrayList<String>();

            // Only process deletes when a deletion expression has been provided
            if (StringUtils.hasText(archive.getDeletionExpression())) {
                // Process all entries beforehand, so as to exclude deletes from the final result
                for (Map.Entry<File, String> storedResult : storedResults.entrySet()) {
                    File storedResultsFolder = storedResult.getKey();

                    for (File entryLocation : storedResultsFolder.listFiles())
                        if (entryLocation.getName().matches(archive.getDeletionExpression())) {
                            String deleteLink = entryLocation.getName().replaceAll(archive.getDeletionExpression(), archive.getInternalLink());

                            deletedLinks.add(deleteLink);

                            // Delete the actual link
                            cacheService.addDeleted(new CacheService.DeletedEntry(archive.getCollection(), deleteLink));
                        }
                }
            }

            // Now process the stored results themselves
            for (Map.Entry<File, String> storedResult : storedResults.entrySet()) {
                File storedResultsFolder = storedResult.getKey();

                // Create the descriptions folder if it doesn't already exist
                File descriptionsFolderLocation = new File(storedResultsFolder, "descriptions");

                if (!descriptionsFolderLocation.exists())
                    descriptionsFolderLocation.mkdirs();

                for (File entryLocation : storedResultsFolder.listFiles()) {
                    if (entryLocation.isDirectory()) {
                        if (!entryLocation.getName().equals("descriptions"))
                            logger.warn("Unknown folder '" + entryLocation.getName() + "' found in decompressed archive folder '" + storedResultsFolder.getAbsolutePath() + "'");

                        continue;
                    } else if (StringUtils.hasText(archive.getDeletionExpression()) && entryLocation.getName().matches(archive.getDeletionExpression()))
                        continue;

                    InputStream transformationInputStream = null;

                    try {
                        transformationInputStream = new BufferedInputStream(new FileInputStream(entryLocation));

                        // Now determine the content type and create a reader in case of structured content
                        MediaType entryMediaType = autoDetectParser.getDetector().detect(transformationInputStream, new Metadata());

                        if (!(entryMediaType.getSubtype().equals("xml") || entryMediaType.getSubtype().endsWith("+xml"))) {
                            logger.warn("Archive entry " + entryLocation.getAbsolutePath() + " contains " + entryMediaType + " data which is unstructured, ignoring");

                            // Close the stream before skipping this entry
                            transformationInputStream.close();

                            continue;
                        }
                    } catch (IOException e) {
                        logger.error("Could not determine the content type of archive entry " + entryLocation.getAbsolutePath(), e);

                        if (transformationInputStream != null)
                            transformationInputStream.close();

                        continue;
                    }

                    // Process it as (semi-)structured content
                    XmlReader transformationReader = new XmlReader(transformationInputStream, true);
                    StringWriter transformationWriter = new StringWriter();

                    try {
                        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF8");

                        // Use a SAX reader for entity resolution
                        XMLReader xmlReader = parserFactory.newSAXParser().getXMLReader();
                        InputSource inputSource = new InputSource(transformationReader);

                        xmlReader.setEntityResolver(transformerCache.getEntityResolver());
                        inputSource.setSystemId("file://" + transformerCache.getDtdImportPath() + "/template.xsl");

                        // Perform the actual transformation
                        transformer.setParameter("substituteUrl", archive.getInternalLink());
                        transformer.transform(new SAXSource(xmlReader, inputSource), new StreamResult(transformationWriter));
                    } catch (TransformerException e) {
                        logger.error("Unable to perform content transformation for entry " + entryLocation.getAbsolutePath());

                        continue;
                    } catch (SAXException e) {
                        logger.error("Unable to perform content transformation for entry " + entryLocation.getAbsolutePath());

                        continue;
                    } catch (ParserConfigurationException e) {
                        logger.error("Unable to perform content transformation for entry " + entryLocation.getAbsolutePath());

                        continue;
                    } finally {
                        transformationInputStream.close();
                        transformationReader.close();
                    }

                    // Create a syndication feed from the given result
                    String resultContent = transformationWriter.toString();

                    SyndFeed resultFeed = null;

                    try {
                        SyndFeedInput feedInput = new SyndFeedInput();

                        resultFeed = feedInput.build(new StringReader(resultContent));
                    } catch (FeedException e) {
                        logger.error("Could not parse the feed resulting from the archive entry transformation");

                        continue;
                    } finally {
                        transformationWriter.close();
                    }

                    // Write the <description> content to a separate file and add it as an <enclosure>
                    if (resultFeed.getEntries().size() > 0) {
                        Integer entryNumber = 0;

                        for (SyndEntry feedEntry : (Collection<SyndEntry>) resultFeed.getEntries()) {
                            if (!deletedLinks.contains(feedEntry.getLink())) {
                                String description = feedEntry.getDescription().getValue().trim();

                                File descriptionLocation = new File(descriptionsFolderLocation, stripExtension(entryLocation.getName()) + "-" + entryNumber++
                                        + (feedEntry.getDescription().getType().equals("text/html") ? ".html" : ".xml"));

                                Writer descriptionWriter = new OutputStreamWriter(new FileOutputStream(descriptionLocation), "UTF-8");

                                if (!description.endsWith("</html>"))
                                    descriptionWriter.write("<html>\n<head>\n\t<title>" + feedEntry.getTitle() + "</title>\n</head>\n<body>\n");

                                descriptionWriter.write(description);

                                if (!description.endsWith("</html>"))
                                    descriptionWriter.write("\n</body>\n</html>");

                                descriptionWriter.flush();
                                descriptionWriter.close();

                                // Remove the link from the processed cache should it already be in there, taking care of updates
                                cacheService.deleteElement(feedEntry.getLink());

                                // Then offer it up to the handler
                                if (logger.isDebugEnabled())
                                    logger.debug("Adding result content (" + entryNumber + ") for archive entry with path " + entryLocation.getAbsolutePath());

                                try {
                                    // NOTE: The encoding of 'UTF-8' is implied for archive-related files
                                    handler.process(new URI(feedEntry.getLink()), archive.getHostname(), feedEntry, resultFeed);
                                } catch (FeedException e) {
                                    logger.error(String.format("Could not offer feed entry with link '%s' - invalid entry", feedEntry.getLink()), e);
                                } catch (URISyntaxException e) {
                                    logger.error(String.format("Could not offer feed entry with link '%s' - invalid link", feedEntry.getLink()), e);
                                }
                            } else
                                logger.info("Skipping over feed entry with link '" + feedEntry.getLink() + "' - marked for deletion");
                        }
                    } else if (logger.isDebugEnabled())
                        logger.debug("No entries were found in archive entry with path " + entryLocation.getAbsolutePath());
                }

                logger.info("Finished processing archive with name " + storedResult.getValue());

                // Now archive the entry in the cache
                cacheService.addArchived(storedResult.getValue());
            }

            logger.info("Finishing archive populator thread");
        } catch (IOException e) {
            logger.error("Could not process the decompressed archive content", e);
        }
    }

    /**
     * Store the not-already-cached files from the given URL and return their locations.
     *
     * @param archive
     * @return Map<File, String>
     */
    private Map<File, String> storeAndDecompressFiles(final Archive archive) {
        Map<File, String> result = new HashMap<File, String>();

        // Create the FTP client
        FTPClient ftpClient = retrieveFtpClient(archive.getUri());

        try {
            // Retrieve the directory listing
            List<ArchiveFile> files = retrieveFiles(archive.getUri().getPath(), ftpClient, archive.getExclusionExpression());
            Integer archiveNumber = -1, archiveTotal = files.size();

            logger.info("Archive with name '" + archive.getName() + "' produced " + archiveTotal + " files");

            // An empty archive typically indicates failure
            if (archiveTotal == 0)
                logger.warn("The given archive produced no entries - something probably went wrong");

            // Handle all archive files
            for (ArchiveFile archiveFile : files) {
                archiveNumber++;

                // Check whether the file already exists within the cache
                String baseUrl = (StringUtils.hasText(archive.getUri().getScheme()) ? archive.getUri().getScheme() + "://" : "")
                        + archive.getUri().getHost() + (archive.getUri().getPort() != -1 ? ":" + archive.getUri().getPort() : "");

                if (!cacheService.isArchived(baseUrl + archiveFile.getFullPath())) {
                    logger.info("Started decompressing archive " + archiveNumber + "/" + archiveTotal + " with name " + archiveFile.getFullPath());

                    // Write out the archive to disk so we can determine the MIME type
                    File archiveFileFolder = new File(archiveFile.getTranslatedPath(packagesLocation.replace("%1", String.valueOf(archive.getId()))));

                    if (!archiveFileFolder.exists())
                        archiveFileFolder.mkdirs();

                    File archiveFileLocation = new File(archiveFileFolder, archiveFile.getFile().getName());

                    InputStream archiveInputStream = ftpClient.retrieveFileStream(archiveFile.getFullPath());
                    OutputStream archiveOutputStream = new FileOutputStream(archiveFileLocation);

                    IOUtils.copy(archiveInputStream, archiveOutputStream);

                    archiveInputStream.close();
                    ftpClient.completePendingCommand();

                    archiveOutputStream.flush();
                    archiveOutputStream.close();

                    // Now unpack the archive and transform each file
                    InputStream compressedArchiveInputStream = new FileInputStream(archiveFileLocation);

                    // Now determine the content type and create a reader in case of structured content
                    MediaType archiveMediaType = autoDetectParser.getDetector().detect(new BufferedInputStream(compressedArchiveInputStream), new Metadata());

                    if (!(archiveMediaType.getType().equals("application") && archiveMediaType.getSubtype().equals("zip"))) {
                        logger.warn("Archive file " + archiveFile.getFullPath() + " contains " + archiveMediaType + " data, which is not yet supported");

                        compressedArchiveInputStream.close();

                        continue;
                    } else
                        compressedArchiveInputStream.close();

                    // Create a new ZIP file from the given archive and decompress it
                    ZipFile zipFile = new ZipFile(archiveFileLocation);

                    File resultsLocationFolder = new File(archiveFile.getTranslatedPath(resultsLocation.replace("%1", String.valueOf(archive.getId()))));

                    if (!resultsLocationFolder.exists())
                        resultsLocationFolder.mkdirs();

                    // The folder to decompress this archive's entries into
                    File archiveResultsLocation = new File(resultsLocationFolder, stripExtension(archiveFile.getFile().getName()));

                    if (!archiveResultsLocation.exists())
                        archiveResultsLocation.mkdirs();

                    logger.info("Started processing archive with name " + archiveFile.getFullPath());

                    Enumeration<? extends ZipEntry> zipEnumerator = zipFile.entries();

                    while (zipEnumerator.hasMoreElements()) {
                        ZipEntry entry = zipEnumerator.nextElement();

                        // Store it locally first
                        File entryLocation = new File(archiveResultsLocation, entry.getName());

                        try {
                            InputStream entryInputStream = zipFile.getInputStream(entry);
                            OutputStream entryOutputStream = new FileOutputStream(entryLocation);

                            IOUtils.copy(entryInputStream, entryOutputStream);

                            entryInputStream.close();
                            entryOutputStream.close();
                        } catch (IOException e) {
                            logger.error("Could not store the compressed archive entry on disk", e);

                            continue;
                        }
                    }

                    zipFile.close();

                    // Add it to the results
                    result.put(archiveResultsLocation, baseUrl + archiveFile.getFullPath());

                    logger.info("Finished processing archive with name " + archiveFile.getFullPath());
                } else if (logger.isDebugEnabled())
                    logger.debug("Skipping previously processed archive with name " + archiveFile.getFullPath());
            }
        } catch (IOException e) {
            logger.error("Could not retrieve and decompress the archive files", e);
        } finally {
            try {
                if (ftpClient.isConnected())
                    ftpClient.disconnect();
            } catch (IOException e) {
                logger.error("Could not disconnect the FTP client", e);
            }
        }

        return result;
    }

    /**
     * Recursively retrieve a directory listing from the FTP server.
     *
     * @param path
     * @param ftpClient
     * @param exclusionExpression
     * @return List<ArchiveFile>
     * @throws IOException
     */
    private static List<ArchiveFile> retrieveFiles(final String path, final FTPClient ftpClient, final String exclusionExpression) throws IOException {
        List<FTPFile> files = Arrays.asList(ftpClient.listFiles(path));
        List<ArchiveFile> result = new Vector<ArchiveFile>();

        for (FTPFile file : files)
            if (!file.getName().equals(".") && !file.getName().equals("..")) {
                if (file.isDirectory()) {
                    String folderPath = path + (path.endsWith("/") ? "" : "/") + file.getName();

                    if (!StringUtils.hasText(exclusionExpression) || !folderPath.matches(exclusionExpression))
                        result.addAll(retrieveFiles(folderPath, ftpClient, exclusionExpression));
                } else if (file.isFile() && (!StringUtils.hasText(exclusionExpression) || !(path + "/" + file.getName()).matches(exclusionExpression)))
                    result.add(new ArchiveFile(path, file));
            }

        return result;
    }

    /**
     * Return the given name without the part after the last '.'.
     *
     * @param name
     * @return String
     */
    private static String stripExtension(final String name) {
        if (name.lastIndexOf('.') > 0)
            return name.substring(0, name.lastIndexOf('.'));

        return name;
    }

    /**
     * Archive file class, which adds a parent folder indicator to an FTPFile.
     *
     * @author Jasper van Veghel <jasper@seajas.com>
     */
    private static class ArchiveFile {
        /**
         * Parent folder.
         */
        private final String parent;

        /**
         * Actual file.
         */
        private final FTPFile file;

        /**
         * Default constructor.
         *
         * @param parent
         * @param file
         */
        public ArchiveFile(final String parent, final FTPFile file) {
            this.parent = parent;
            this.file = file;
        }

        /**
         * Retrieve the parent.
         *
         * @return String
         */
        public String getParent() {
            return parent;
        }

        /**
         * Retrieve the file.
         *
         * @return FTPFile
         */
        public FTPFile getFile() {
            return file;
        }

        /**
         * Retrieve the full path.
         *
         * @return String
         */
        public String getFullPath() {
            return getParent() + "/" + getFile().getName();
        }

        /**
         * Retrieve a translated path.
         *
         * @param base
         * @return String
         */
        public String getTranslatedPath(final String base) {
            if (base.endsWith(File.separator))
                return base.substring(0, base.length() - 1) + parent.replace('/', File.separatorChar);
            else
                return base + parent.replace('/', File.separatorChar);
        }
    }

    /**
     * Archive result handler interface.
     *
     * @author Jasper van Veghel <jasper@seajas.com>
     */
    public static interface ArchiveResultHandler {
        /**
         * Process the given archive result.
         *
         * @param uri
         * @param hostname
         * @param entry
         * @param feed
         * @throws FeedException
         */
        void process(final URI uri, final String hostname, final SyndEntry entry, final SyndFeed feed) throws FeedException;
    }
}
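For completeness, the sketch below shows one way a caller might drive this service. It is only an illustrative sketch, not part of the original code: it assumes the service is injected from the Spring application context and that a populated Archive instance arrives from elsewhere (the Archive model itself is not shown in this listing), and it only uses the testConnection and handleArchive methods plus the ArchiveResultHandler callback defined above. The ArchiveProcessingExample class and its processArchive method are hypothetical names chosen for the example.

package com.seajas.search.contender.service.modifier;

import com.seajas.search.bridge.jms.model.Archive;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;

import java.net.URI;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 * Hypothetical caller of the ArchiveModifierService - a sketch for illustration only.
 */
@Service
public class ArchiveProcessingExample {
    private static final Logger logger = LoggerFactory.getLogger(ArchiveProcessingExample.class);

    /**
     * The archive modifier service, assumed to be injected from the Spring context.
     */
    @Autowired
    private ArchiveModifierService archiveModifierService;

    /**
     * Verify the archive location and process each transformed feed entry it yields.
     *
     * @param archive a fully populated Archive, assumed to arrive from elsewhere (e.g. the JMS bridge)
     */
    public void processArchive(final Archive archive) throws Exception {
        // Bail out early when the FTP(S) location cannot be listed
        if (!archiveModifierService.testConnection(archive.getUri())) {
            logger.warn("Archive with URI " + archive.getUri() + " is not reachable - skipping");

            return;
        }

        // Each decompressed and transformed entry is offered to this callback
        archiveModifierService.handleArchive(archive, new ArchiveModifierService.ArchiveResultHandler() {
            @Override
            public void process(final URI uri, final String hostname, final SyndEntry entry, final SyndFeed feed) throws FeedException {
                // Forward the entry to indexing, a message queue, etc. - logging suffices for the sketch
                logger.info("Received entry " + uri + " (" + entry.getTitle() + ") from host " + hostname);
            }
        });
    }
}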