/*******************************************************************************
 * Copyright 2013 Università degli Studi di Firenze
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.drwolf.ridire.session.async;

import it.drwolf.ridire.cleaners.utils.StringWithEncoding;
import it.drwolf.ridire.entity.CommandParameter;
import it.drwolf.ridire.entity.CrawledResource;
import it.drwolf.ridire.entity.Job;
import it.drwolf.ridire.entity.Parameter;
import it.drwolf.ridire.util.MD5DigestCreator;
import it.drwolf.ridire.utility.RIDIREReTagger;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import javax.persistence.EntityManager;
import javax.persistence.RollbackException;
import javax.transaction.HeuristicMixedException;
import javax.transaction.HeuristicRollbackException;
import javax.transaction.NotSupportedException;
import javax.transaction.SystemException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.ExecuteStreamHandler;
import org.apache.commons.exec.ExecuteWatchdog;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.PrefixFileFilter;
import org.apache.commons.lang.text.StrTokenizer;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.jboss.seam.Component;
import org.jboss.seam.ScopeType;
import org.jboss.seam.annotations.In;
import org.jboss.seam.contexts.Lifecycle;
import org.jboss.seam.faces.Renderer;
import org.jboss.seam.transaction.UserTransaction;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import de.spieleck.app.cngram.NGramProfiles;
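/**
 * Maps the resources collected by a Heritrix crawl job into the RIDIRE
 * database. For every successfully fetched ARC record the mapper stores the
 * raw content as a gzipped file, extracts plain text (via Tika and an
 * external cleaner), detects the language and, for Italian text with PoS
 * tagging enabled, runs TreeTagger to produce a .pos file and a word count.
 * Duplicate resources (same digest, URL, or extracted text) are skipped or
 * merged with the previously mapped resource.
 */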
public class Mapper implements Runnable {

    private interface OutputType {
        ContentHandler getContentHandler(String encoding, Writer writer)
                throws TransformerConfigurationException;
    }

    private static final int PDFCLEANER_TIMEOUT = 300000;

    public static boolean isValidPos(String pos) {
        return !Mapper.notWordPoSs.contains(pos);
    }

    private Integer jobId;
    private EntityManager em = null;
    private boolean running = false;
    private Job job;
    private UserTransaction mapperUserTx;

    private static final String ATTICDIR = "attic/";
    private static final int BUFLENGTH = 8192;
    private static final String ITALIAN = "it";
    private static final long TREETAGGER_TIMEOUT = 240000; // 4 mins
    private static final long READABILITY_TIMEOUT = 360000; // 6 mins

    // PoS tags that do not count as words (punctuation, sentence ends, symbols)
    private static List<String> notWordPoSs = new ArrayList<String>() {
        private static final long serialVersionUID = -3892423757401579609L;
        {
            this.add("PON");
            this.add("SENT");
            this.add("SYM");
        }
    };

    @In(create = true)
    private Renderer renderer;

    private String tempDir;

    private final OutputType HTML = new OutputType() {
        public ContentHandler getContentHandler(String encoding, Writer writer)
                throws TransformerConfigurationException {
            return Mapper.this.getTransformerHandler("html", encoding, writer);
        }
    };

    private FlagBearer flagBearer;
    private RIDIREReTagger ridireReTagger;

    private final List<String> allowedCharsets = new ArrayList<String>() {
        private static final long serialVersionUID = 2950026736924724677L;
        {
            this.add("UTF-8");
            this.add("ISO-8859-1");
            this.add("UTF-16BE");
            this.add("UTF-16LE");
        }
    };

    private static final String ALCHEMY = "alchemy";
    private static final String READABILITY = "readability";

    /**
     * Counts the words in a PoS-tagged (TSV) file, skipping tokens whose tag
     * is listed in {@link #notWordPoSs}.
     */
    @SuppressWarnings("unchecked")
    public static Integer countWordsFromPoSTagResource(String posTagResourceFileName) throws IOException {
        List<String> lines = FileUtils.readLines(new File(posTagResourceFileName));
        Integer count = 0;
        StrTokenizer tokenizer = StrTokenizer.getTSVInstance();
        for (String l : lines) {
            tokenizer.reset(l);
            String[] tokens = tokenizer.getTokenArray();
            if (tokens.length == 3) {
                if (Mapper.isValidPos(tokens[1].trim())) {
                    ++count;
                }
            }
        }
        return count;
    }

    public Mapper(Job job, FlagBearer flagBearer) {
        this.job = job;
        this.jobId = job.getId();
        this.flagBearer = flagBearer;
        this.ridireReTagger = new RIDIREReTagger(null);
    }
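    /**
     * Stores a single ARC record on disk and derives all of its secondary
     * artifacts: the gzipped raw content, the cleaned plain-text version, the
     * MD5 digest of the extracted text, the detected language and, for
     * Italian resources when PoS tagging is enabled, the .pos file and word
     * count. On any failure the resource is flagged with PROCESSING_ERROR.
     */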
    private void createArchivedResource(File f, CrawledResource cr, EntityManager entityManager) {
        String posEnabled = this.em.find(Parameter.class, Parameter.POS_ENABLED.getKey()).getValue();
        File resourceDir;
        int status = Parameter.FINISHED;
        try {
            resourceDir = new File(FilenameUtils.getFullPath(f.getCanonicalPath().replaceAll("__\\d+", ""))
                    + JobMapperMonitor.RESOURCESDIR);
            if (!resourceDir.exists()) {
                FileUtils.forceMkdir(resourceDir);
            }
            ArchiveReader reader = ArchiveReaderFactory.get(f);
            ARCRecord record = (ARCRecord) reader.get(cr.getOffset());
            record.skipHttpHeader();
            byte[] buf = new byte[Mapper.BUFLENGTH];
            int count = 0;
            String resourceFile = cr.getDigest() + ".gz";
            GZIPOutputStream baos = new GZIPOutputStream(new FileOutputStream(new File(resourceDir, resourceFile)));
            while ((count = record.read(buf)) != -1) {
                baos.write(buf, 0, count);
            }
            baos.finish();
            baos.close();
            reader.close();
            StringWithEncoding cleanText = this.createPlainTextResource(f, cr, entityManager);
            this.removeGZippedResource(resourceDir, resourceFile);
            String plainTextFileName = cr.getDigest() + ".txt";
            if (cleanText != null && cleanText.getString() != null && cleanText.getString().trim().length() > 0
                    && cleanText.getCleaner() != null
                    && (cleanText.getCleaner().equals(Mapper.ALCHEMY)
                            || cleanText.getCleaner().equals(Mapper.READABILITY))) {
                cr.setCleaner(cleanText.getCleaner());
                File plainTextFile = new File(resourceDir, plainTextFileName);
                FileUtils.writeStringToFile(plainTextFile, cleanText.getString(), cleanText.getEncoding());
                cr.setExtractedTextHash(MD5DigestCreator.getMD5Digest(plainTextFile));
                // language detection
                String language = this.detectLanguage(cleanText.getString());
                cr.setLanguage(language);
                if (language != null && language.equalsIgnoreCase(Mapper.ITALIAN) && posEnabled != null
                        && posEnabled.equalsIgnoreCase("true")) {
                    // PoS tag only if it is an Italian text
                    String posTagResourceFileName = this.createPoSTagResource(plainTextFile, entityManager,
                            cleanText.getEncoding());
                    if (posTagResourceFileName != null) {
                        Integer wordsNumber = Mapper.countWordsFromPoSTagResource(posTagResourceFileName);
                        cr.setWordsNumber(wordsNumber);
                    }
                }
            }
        } catch (Exception e) {
            status = Parameter.PROCESSING_ERROR;
            e.printStackTrace();
        }
        cr.setProcessed(status);
    }

    private void createArchivedResourceAndDeleteFromAttic(File f, String oldDigest, CrawledResource cr,
            EntityManager entityManager) throws SAXException, TikaException, IOException {
        this.createArchivedResource(f, cr, entityManager);
        this.removeResourceInAttic(f, oldDigest, entityManager);
    }
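    /**
     * Extracts clean text from a stored resource. The content type decides
     * the first conversion step (MS Word, RTF and plain text go through Tika,
     * PDF through pdftohtml, HTML is only charset-guessed); the resulting
     * HTML is then piped through the external RIDIRE cleaner jar. The
     * cleaner's stderr output is recorded and later checked against the
     * "alchemy"/"readability" markers to see which strategy was applied.
     */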
    private StringWithEncoding createPlainTextResource(File f, CrawledResource cr, EntityManager entityManager)
            throws SAXException, TikaException, IOException, TransformerConfigurationException,
            InterruptedException {
        File resourceDir = new File(FilenameUtils.getFullPath(f.getCanonicalPath().replaceAll("__\\d+", ""))
                + JobMapperMonitor.RESOURCESDIR);
        String alchemyKey = entityManager.find(Parameter.class, Parameter.ALCHEMY_KEY.getKey()).getValue();
        String readabilityKey = entityManager.find(Parameter.class, Parameter.READABILITY_KEY.getKey()).getValue();
        String resourceFileName = cr.getDigest() + ".gz";
        File resourceFile = new File(resourceDir, resourceFileName);
        StringWithEncoding rawContentAndEncoding = null;
        String contentType = cr.getContentType();
        if (contentType != null && contentType.contains("application/msword")) {
            rawContentAndEncoding = this.transformDOC2HTML(resourceFile, entityManager);
        }
        if (contentType != null && contentType.contains("application/rtf")) {
            rawContentAndEncoding = this.transformRTF2HTML(resourceFile, entityManager);
        }
        if (contentType != null && contentType.contains("text/plain")) {
            // txt -> html -> txt is for txt cleaning
            rawContentAndEncoding = this.transformTXT2HTML(resourceFile, entityManager);
        }
        if (contentType != null && contentType.contains("pdf")) {
            rawContentAndEncoding = this.transformPDF2HTML(resourceFile, entityManager);
        }
        if (contentType != null && contentType.contains("html")) {
            rawContentAndEncoding = this.getGuessedEncodingAndSetRawContentFromGZFile(resourceFile);
        }
        if (rawContentAndEncoding != null) {
            if (rawContentAndEncoding.getEncoding() == null) {
                rawContentAndEncoding = new StringWithEncoding(rawContentAndEncoding.getString(), "UTF8");
            }
            String cleanText = this.replaceUnsupportedChars(rawContentAndEncoding.getString());
            rawContentAndEncoding = new StringWithEncoding(cleanText, rawContentAndEncoding.getEncoding());
            File tmpFile = File.createTempFile("ridire", null);
            FileUtils.writeStringToFile(tmpFile, rawContentAndEncoding.getString(), "UTF-8");
            String ridireCleanerJar = entityManager
                    .find(CommandParameter.class, CommandParameter.RIDIRE_CLEANER_EXECUTABLE_KEY).getCommandValue();
            String host = entityManager.find(Parameter.class, Parameter.READABILITY_HOSTAPP.getKey()).getValue();
            CommandLine commandLine = CommandLine
                    .parse("java -Xmx128m -Djava.io.tmpdir=" + this.tempDir + " -jar " + ridireCleanerJar);
            commandLine.addArgument("-f");
            commandLine.addArgument(tmpFile.getPath());
            commandLine.addArgument("-e");
            commandLine.addArgument("UTF-8");
            commandLine.addArgument("-h");
            commandLine.addArgument(host);
            commandLine.addArgument("-k");
            commandLine.addArgument(alchemyKey);
            commandLine.addArgument("-r");
            commandLine.addArgument(readabilityKey);
            DefaultExecutor executor = new DefaultExecutor();
            executor.setExitValue(0);
            ExecuteWatchdog watchdog = new ExecuteWatchdog(Mapper.READABILITY_TIMEOUT);
            executor.setWatchdog(watchdog);
            ByteArrayOutputStream baosStdOut = new ByteArrayOutputStream(1024);
            ByteArrayOutputStream baosStdErr = new ByteArrayOutputStream(1024);
            ExecuteStreamHandler executeStreamHandler = new PumpStreamHandler(baosStdOut, baosStdErr, null);
            executor.setStreamHandler(executeStreamHandler);
            int exitValue = executor.execute(commandLine);
            if (exitValue == 0) {
                rawContentAndEncoding = new StringWithEncoding(baosStdOut.toString(), "UTF-8");
                // TODO filter real errors
                rawContentAndEncoding.setCleaner(baosStdErr.toString().trim());
            }
            FileUtils.deleteQuietly(tmpFile);
        }
        return rawContentAndEncoding;
    }
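    /**
     * Runs TreeTagger (through RIDIREReTagger) on the plain-text file and
     * places the result next to it with a .pos extension. The text is first
     * copied to a temporary file because TreeTagger does not handle spaces
     * inside file names correctly.
     */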
    private String createPoSTagResource(File plainTextFile, EntityManager entityManager, String encoding)
            throws InterruptedException, IOException {
        File tmpFile = File.createTempFile("treetagger", null);
        FileUtils.copyFile(plainTextFile, tmpFile);
        String treeTaggerBin = entityManager
                .find(CommandParameter.class, CommandParameter.TREETAGGER_EXECUTABLE_KEY).getCommandValue();
        // NOTE: an earlier, commented-out version of this method chose a
        // UTF-8-specific TreeTagger binary based on the encoding parameter and
        // invoked TreeTagger directly via commons-exec with
        // TREETAGGER_TIMEOUT; RIDIREReTagger supersedes that code.
        this.ridireReTagger.setTreetaggerBin(treeTaggerBin);
        String tmpPoSFile = this.ridireReTagger.retagFile(tmpFile);
        File newPosFile = new File(plainTextFile.getAbsolutePath() + ".pos");
        if (tmpPoSFile != null) {
            if (newPosFile.exists()) {
                FileUtils.deleteQuietly(newPosFile);
            }
            FileUtils.moveFile(new File(tmpPoSFile), newPosFile);
            return newPosFile.getAbsolutePath();
        }
        return null;
    }

    private String detectLanguage(String cleanText) throws IOException {
        NGramProfiles nps = new NGramProfiles();
        NGramProfiles.Ranker ranker = nps.getRanker();
        ranker.account(cleanText);
        NGramProfiles.RankResult res = ranker.getRankResult();
        String language = null;
        if (res != null) {
            language = res.getName(0);
        }
        return language;
    }
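    /**
     * True when another crawled resource already has the same extracted-text
     * MD5 hash; such duplicates are removed again right after mapping.
     */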
    private boolean existsResourceWithSameExtractedText(CrawledResource cr) {
        Long count = (Long) this.em
                .createQuery(
                        "select count(cr) from CrawledResource cr where cr.extractedTextHash=:md5 and cr.id<>:id")
                .setParameter("md5", cr.getExtractedTextHash()).setParameter("id", cr.getId()).getSingleResult();
        return count > 0;
    }

    private StringWithEncoding getGuessedEncodingAndSetRawContentFromGZFile(File resourceFile) throws IOException {
        byte[] buf = new byte[Mapper.BUFLENGTH];
        int count = 0;
        GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(resourceFile));
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        while ((count = gzis.read(buf)) != -1) {
            baos.write(buf, 0, count);
        }
        gzis.close();
        baos.close();
        CharsetDetector charsetDetector = new CharsetDetector();
        byte[] byteArray = baos.toByteArray();
        charsetDetector.setText(byteArray);
        CharsetMatch[] matches = charsetDetector.detectAll();
        // default to ISO-8859-1 unless the detector proposes an allowed charset
        String encoding = this.allowedCharsets.get(1);
        for (CharsetMatch cm : matches) {
            if (this.allowedCharsets.contains(cm.getName())) {
                encoding = cm.getName();
                break;
            }
        }
        String rawContent = new String(byteArray, encoding);
        return new StringWithEncoding(rawContent, encoding);
    }

    public Integer getJobId() {
        return this.jobId;
    }

    public String getJobName() {
        if (this.job != null) {
            return this.job.getName();
        }
        return "" + this.jobId;
    }

    /**
     * Returns a transformer handler that serializes incoming SAX events to
     * XHTML or HTML (depending on the given method) using the given output
     * encoding.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     * @param method
     *            "xml" or "html"
     * @param encoding
     *            output encoding, or <code>null</code> for the platform default
     * @param writer
     *            target of the serialized output
     * @return serializing transformer handler
     * @throws TransformerConfigurationException
     *             if the transformer can not be created
     */
    private TransformerHandler getTransformerHandler(String method, String encoding, Writer writer)
            throws TransformerConfigurationException {
        SAXTransformerFactory factory = (SAXTransformerFactory) TransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        if (encoding != null) {
            handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding);
        }
        handler.setResult(new StreamResult(writer));
        return handler;
    }

    private boolean isResourceAlreadyMapped(String digest, String url, EntityManager entityManager) {
        return entityManager.createQuery("from CrawledResource cr where cr.digest=:digest or cr.url=:url")
                .setParameter("digest", digest).setParameter("url", url).getResultList().size() > 0;
    }

    public boolean isRunning() {
        return this.running;
    }

    private void lookForNoMoreAvailableResources(Job persistedJob, Set<CrawledResource> childResources, File f,
            EntityManager entityManager) throws IOException, NotSupportedException, SystemException,
            SecurityException, IllegalStateException, RollbackException, HeuristicMixedException,
            HeuristicRollbackException {
        for (CrawledResource cr : persistedJob.getCrawledResources()) {
            if (!childResources.contains(cr)) {
                cr.setNoMoreAvailable(true);
                entityManager.persist(cr);
                this.moveResourceInAttic(cr, f);
            }
        }
    }

    private void moveResourceInAttic(CrawledResource cr, File f) throws IOException {
        File atticDir = new File(
                FilenameUtils.getFullPath(f.getCanonicalPath().replaceAll("__\\d+", "")) + Mapper.ATTICDIR);
        File resourceDir = new File(FilenameUtils.getFullPath(f.getCanonicalPath().replaceAll("__\\d+", ""))
                + JobMapperMonitor.RESOURCESDIR);
        File toBeMoved = new File(resourceDir, cr.getDigest() + ".gz");
        FileUtils.moveFileToDirectory(toBeMoved, atticDir, true);
        toBeMoved = new File(resourceDir, cr.getDigest() + ".txt");
        // the plain-text version may not exist (original code tested != null,
        // which is always true)
        if (toBeMoved.exists()) {
            FileUtils.moveFileToDirectory(toBeMoved, atticDir, true);
        }
    }

    private void removeGZippedResource(File resourceDir, String resourceFile) {
        FileUtils.deleteQuietly(
                new File(resourceDir.getAbsolutePath() + System.getProperty("file.separator") + resourceFile));
    }

    private void removeModifiedArchivedResource(File f, String oldDigest, EntityManager entityManager)
            throws IOException {
        this.removeResource(f, oldDigest, false, entityManager);
    }

    private void removeResource(File f, String oldDigest, boolean attic, EntityManager entityManager)
            throws IOException {
        File resourceDir = null;
        if (attic) {
            resourceDir = new File(
                    FilenameUtils.getFullPath(f.getCanonicalPath().replaceAll("__\\d+", "")) + Mapper.ATTICDIR);
        } else {
            resourceDir = new File(FilenameUtils.getFullPath(f.getCanonicalPath().replaceAll("__\\d+", ""))
                    + JobMapperMonitor.RESOURCESDIR);
        }
        String resourceFile = oldDigest + ".gz";
        File toBeDeleted = new File(resourceDir, resourceFile);
        if (toBeDeleted.exists() && toBeDeleted.canWrite()) {
            toBeDeleted.delete();
        }
        String txtResourceFile = oldDigest + ".txt";
        toBeDeleted = new File(resourceDir, txtResourceFile);
        if (toBeDeleted.exists() && toBeDeleted.canWrite()) {
            toBeDeleted.delete();
        }
    }
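    /**
     * Resources that disappear between two crawls of the same job are parked
     * in the attic/ directory by lookForNoMoreAvailableResources(); if a
     * later crawl finds them again, they are re-mapped and their attic copy
     * is deleted here.
     */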
    private void removeResourceInAttic(File f, String oldDigest, EntityManager entityManager) throws IOException {
        this.removeResource(f, oldDigest, true, entityManager);
    }

    /**
     * Deals with MS special and non-standard characters. The original
     * character literals were lost in this file's encoding; the usual
     * Windows-1252 "smart" quotes, dash and ellipsis are assumed below (the
     * surviving text contained an unescaped "?", which would have thrown a
     * PatternSyntaxException as a regex).
     */
    private String replaceUnsupportedChars(String cleanText) {
        cleanText = cleanText.replaceAll("\u2019", "'"); // right single quote (assumed)
        cleanText = cleanText.replaceAll("\u2018", "'"); // left single quote (assumed)
        cleanText = cleanText.replaceAll("\u201C", "\""); // left double quote (assumed)
        cleanText = cleanText.replaceAll("\u201D", "\""); // right double quote (assumed)
        cleanText = cleanText.replaceAll("\u2013", "-"); // en dash (assumed)
        cleanText = cleanText.replaceAll("\u2026", "..."); // ellipsis (assumed)
        return cleanText;
    }
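    /**
     * Main mapping loop. For each ARC file of the (possibly re-crawled child)
     * job it iterates over the records with HTTP status 200 and persists one
     * CrawledResource per record, processing each record in its own Seam
     * UserTransaction so that a failure loses only that record. When mapping
     * a re-crawl, URLs that are no longer present are moved to the attic and
     * previously vanished URLs are restored from it. At the end the job is
     * marked as mapped and a notification mail is rendered for the job owner.
     */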
    public void run() {
        Random random = new Random();
        // stagger concurrent mappers by up to 100 ms
        int waitMillis = random.nextInt(100);
        this.setRunning(true);
        Lifecycle.beginCall();
        this.mapperUserTx = (UserTransaction) Component
                .getInstance("org.jboss.seam.transaction.transaction", ScopeType.CONVERSATION);
        this.em = (EntityManager) Component.getInstance("eventEntityManager");
        this.tempDir = this.em.find(Parameter.class, Parameter.TEMP_DIR.getKey()).getValue();
        try {
            this.mapperUserTx.setTransactionTimeout(60 * 20); // 20 minutes
            Thread.sleep(waitMillis);
            if (this.mapperUserTx != null && !this.mapperUserTx.isActive()) {
                if (this.mapperUserTx.getStatus() != javax.transaction.Status.STATUS_ACTIVE) {
                    this.mapperUserTx.begin();
                }
            }
            this.em.joinTransaction();
            String dir = this.em.find(Parameter.class, Parameter.JOBS_DIR.getKey()).getValue();
            File filename = null;
            boolean childJobMapping = false;
            Job persistedJob = this.em.find(Job.class, this.jobId);
            if (persistedJob.getChildJobName() != null && !persistedJob.isMappedResources()) {
                childJobMapping = true;
                filename = new File(dir + JobMapperMonitor.FILE_SEPARATOR + persistedJob.getChildJobName()
                        + JobMapperMonitor.FILE_SEPARATOR + "arcs" + JobMapperMonitor.FILE_SEPARATOR);
            } else {
                filename = new File(dir + JobMapperMonitor.FILE_SEPARATOR + persistedJob.getName()
                        + JobMapperMonitor.FILE_SEPARATOR + "arcs" + JobMapperMonitor.FILE_SEPARATOR);
            }
            this.em.flush();
            this.mapperUserTx.commit();
            File[] arcFiles = filename.listFiles();
            if (arcFiles == null) {
                // try also the 'completed-' prefix for backward compatibility
                filename = new File(dir + JobMapperMonitor.FILE_SEPARATOR + "completed-" + persistedJob.getName()
                        + JobMapperMonitor.FILE_SEPARATOR + "arcs" + JobMapperMonitor.FILE_SEPARATOR);
                arcFiles = filename.listFiles();
            } else {
                for (File f : arcFiles) {
                    if (f.getName().equals("resources")) {
                        continue;
                    }
                    boolean uncompressed = false;
                    if (f.getName().endsWith(".gz")) {
                        f = this.uncompressGzippedArcFile(f);
                        uncompressed = true;
                    }
                    ArchiveReader archiveReader;
                    try {
                        archiveReader = ArchiveReaderFactory.get(f);
                    } catch (IOException e) {
                        System.err.println("Error reading file: " + f.getAbsolutePath());
                        e.printStackTrace();
                        continue;
                    }
                    Iterator<ArchiveRecord> itOnArchiveRecord = archiveReader.iterator();
                    Set<CrawledResource> childResources = new HashSet<CrawledResource>();
                    while (itOnArchiveRecord.hasNext()) {
                        ARCRecord archiveRecord = (ARCRecord) itOnArchiveRecord.next();
                        ARCRecordMetaData metadata = archiveRecord.getMetaData();
                        Date now = new Date();
                        // store only succeeded requests
                        if (metadata.getStatusCode() != null && metadata.getStatusCode().equals("200")) {
                            String url = metadata.getUrl();
                            long length = metadata.getLength();
                            System.out.println("Mapping URL: " + url + "\n" + "Size: " + length + "\t"
                                    + this.mapperUserTx);
                            try {
                                archiveRecord.skipHttpHeader();
                                archiveRecord.close();
                                String digest = archiveRecord.getDigestStr();
                                if (!childJobMapping) {
                                    if (this.mapperUserTx != null && !this.mapperUserTx.isActive()) {
                                        if (this.mapperUserTx
                                                .getStatus() != javax.transaction.Status.STATUS_ACTIVE) {
                                            this.mapperUserTx.begin();
                                        }
                                    }
                                    this.em.joinTransaction();
                                    // do not store equal resources
                                    if (!this.isResourceAlreadyMapped(digest, url, this.em)) {
                                        CrawledResource cr = new CrawledResource();
                                        cr.setDigest(digest);
                                        cr.setOffset(metadata.getOffset());
                                        cr.setArcFile(f.getCanonicalPath());
                                        cr.setArchiveDate(now);
                                        cr.setLastModified(now);
                                        cr.setLength(length);
                                        cr.setContentType(metadata.getMimetype());
                                        cr.setIp(metadata.getIp());
                                        cr.setUrl(url);
                                        cr.setJob(persistedJob);
                                        persistedJob = this.em.find(Job.class, this.jobId);
                                        persistedJob.getCrawledResources().add(cr);
                                        this.em.persist(cr);
                                        this.em.persist(persistedJob);
                                        this.createArchivedResource(f, cr, this.em);
                                        this.em.persist(cr);
                                        if (this.existsResourceWithSameExtractedText(cr)) {
                                            persistedJob.getCrawledResources().remove(cr);
                                            this.em.persist(persistedJob);
                                            this.em.remove(cr);
                                        }
                                    }
                                    this.em.flush();
                                    this.mapperUserTx.commit();
                                } else {
                                    CrawledResource cr = new CrawledResource();
                                    cr.setUrl(url);
                                    if (this.mapperUserTx != null && !this.mapperUserTx.isActive()) {
                                        if (this.mapperUserTx
                                                .getStatus() != javax.transaction.Status.STATUS_ACTIVE) {
                                            this.mapperUserTx.begin();
                                        }
                                    }
                                    this.em.joinTransaction();
                                    // do not store equal resources
                                    if (!this.isResourceAlreadyMapped(digest, url, this.em)) {
                                        // check whether this URL was crawled before
                                        CrawledResource sameUrlResource = this.sameURLExists(url, persistedJob,
                                                this.em);
                                        boolean modifiedResource = false;
                                        String oldDigest = null;
                                        if (sameUrlResource != null) {
                                            // modified resource
                                            modifiedResource = true;
                                            oldDigest = sameUrlResource.getDigest();
                                            cr = sameUrlResource;
                                        } else {
                                            // new resource
                                            cr.setArchiveDate(now);
                                        }
                                        cr.setDigest(digest);
                                        cr.setNoMoreAvailable(false);
                                        cr.setOffset(metadata.getOffset());
                                        cr.setLastModified(now);
                                        cr.setArcFile(f.getCanonicalPath());
                                        cr.setLength(length);
                                        cr.setContentType(metadata.getMimetype());
                                        cr.setIp(metadata.getIp());
                                        cr.setJob(persistedJob);
                                        persistedJob = this.em.find(Job.class, this.jobId);
                                        persistedJob.getCrawledResources().add(cr);
                                        this.em.persist(cr);
                                        this.em.persist(persistedJob);
                                        if (modifiedResource) {
                                            this.updateArchivedResource(f, cr, oldDigest, this.em);
                                            this.em.persist(cr);
                                        } else {
                                            this.createArchivedResource(f, cr, this.em);
                                            if (this.existsResourceWithSameExtractedText(cr)) {
                                                persistedJob.getCrawledResources().remove(cr);
                                                this.em.persist(persistedJob);
                                                this.em.remove(cr);
                                            }
                                        }
                                    }
                                    // a URL that was marked as noMoreAvailable and is
                                    // available again is restored from the attic
                                    CrawledResource sameUrlResource = this.sameURLExists(url, persistedJob,
                                            this.em);
                                    if (sameUrlResource != null && sameUrlResource.isNoMoreAvailable()) {
                                        String oldDigest = sameUrlResource.getDigest();
                                        cr = sameUrlResource;
                                        cr.setDigest(digest);
                                        cr.setNoMoreAvailable(false);
                                        cr.setOffset(metadata.getOffset());
                                        cr.setLastModified(now);
                                        cr.setArcFile(f.getCanonicalPath());
                                        cr.setLength(length);
                                        cr.setContentType(metadata.getMimetype());
                                        cr.setIp(metadata.getIp());
                                        cr.setJob(persistedJob);
                                        persistedJob = this.em.find(Job.class, this.jobId);
                                        persistedJob.getCrawledResources().add(cr);
                                        this.em.persist(cr);
                                        this.em.persist(persistedJob);
                                        this.createArchivedResourceAndDeleteFromAttic(f, oldDigest, cr, this.em);
                                        this.em.persist(cr);
                                    }
                                    childResources.add(cr);
                                    this.em.flush();
                                    this.mapperUserTx.commit();
                                }
                            } catch (Exception e) {
                                e.printStackTrace();
                            } finally {
                                try {
                                    if (this.mapperUserTx != null && this.mapperUserTx.isActive()) {
                                        this.mapperUserTx.rollback();
                                    }
                                } catch (IllegalStateException e1) {
                                    e1.printStackTrace();
                                } catch (SecurityException e1) {
                                    e1.printStackTrace();
                                } catch (SystemException e1) {
                                    e1.printStackTrace();
                                }
                                this.setRunning(false);
                            }
                        }
                    }
                    archiveReader.close();
                    if (uncompressed) {
                        FileUtils.deleteQuietly(f);
                    }
                    if (childJobMapping) {
                        if (this.mapperUserTx != null && !this.mapperUserTx.isActive()) {
                            if (this.mapperUserTx.getStatus() != javax.transaction.Status.STATUS_ACTIVE) {
                                this.mapperUserTx.begin();
                            }
                        }
                        this.em.joinTransaction();
                        this.lookForNoMoreAvailableResources(persistedJob, childResources, f, this.em);
                        this.em.flush();
                        this.mapperUserTx.commit();
                    }
                }
            }
            if (this.mapperUserTx != null && !this.mapperUserTx.isActive()) {
                if (this.mapperUserTx.getStatus() != javax.transaction.Status.STATUS_ACTIVE) {
                    this.mapperUserTx.begin();
                }
            }
            this.em.joinTransaction();
            this.em.refresh(persistedJob);
            persistedJob.setMappedResources(true);
            this.em.persist(persistedJob);
            this.em.flush();
            this.mapperUserTx.commit();
            this.flagBearer.setOwnerName(persistedJob.getCrawlerUser().getName());
            this.flagBearer.setOwnerSurname(persistedJob.getCrawlerUser().getSurname());
            this.flagBearer.setJobName(persistedJob.getName());
            this.flagBearer.setEmailAddress(persistedJob.getCrawlerUser().getEmail());
            if (this.flagBearer.getEmailAddress() != null
                    && this.flagBearer.getEmailAddress().trim().length() > 0) {
                this.renderer.render("/mail/mappedJob.xhtml");
            }
            this.setRunning(false);
            System.out.println("Job " + this.getJobName() + " mapping terminated.");
        } catch (Exception e) {
            if (this.mapperUserTx != null) {
                try {
                    System.err.println("STATUS: " + this.mapperUserTx.getStatus());
                } catch (SystemException e1) {
                    System.err.println("STACKTRACE mapperUserTx");
                    e1.printStackTrace();
                }
            }
            e.printStackTrace();
        } finally {
            try {
                if (this.mapperUserTx != null && this.mapperUserTx.isActive()) {
                    this.mapperUserTx.rollback();
                }
            } catch (Exception e) {
                System.err.println("FINALLY");
                e.printStackTrace();
            }
            Lifecycle.endCall();
            this.setRunning(false);
        }
    }

    @SuppressWarnings("unchecked")
    private CrawledResource sameURLExists(String url, Job persistedJob, EntityManager entityManager) {
        List<CrawledResource> listCR = entityManager.createQuery("from CrawledResource cr where cr.url=:url")
                .setParameter("url", url).getResultList();
        if (listCR.size() == 1) {
            return listCR.get(0);
        }
        // zero or multiple matches are treated as "no previous resource"
        return null;
    }

    public void setRunning(boolean running) {
        this.running = running;
    }

    private StringWithEncoding transformDOC2HTML(File resourceFile, EntityManager entityManager)
            throws IOException, SAXException, TikaException, TransformerConfigurationException {
        ParseContext context = new ParseContext();
        Parser parser = new AutoDetectParser();
        context.set(Parser.class, parser);
        Metadata metadata = new Metadata();
        Writer writer = null;
        if (resourceFile.isFile()) {
            metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceFile.getName());
            InputStream input = new FileInputStream(resourceFile);
            try {
                writer = new StringWriter();
                parser.parse(input, this.HTML.getContentHandler(null, writer), metadata, context);
            } finally {
                input.close();
                if (writer != null) {
                    writer.close();
                }
            }
            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.setText(writer.toString().getBytes());
            String encoding = charsetDetector.detect().getName();
            return new StringWithEncoding(writer.toString(), encoding);
        }
        return null;
    }
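    /**
     * Converts a gzipped PDF resource to HTML: the PDF is unpacked to a
     * temporary file, converted with the external pdftohtml tool (-c/-i:
     * complex layout, ignore images), then post-processed by the PDFCleaner
     * jar, whose three numeric arguments are passed unchanged from the
     * original configuration. Temporary files are cleaned up by UUID prefix.
     */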
    @SuppressWarnings("unchecked")
    private StringWithEncoding transformPDF2HTML(File resourceFile, EntityManager entityManager)
            throws IOException, InterruptedException {
        String workingDirName = System.getProperty("java.io.tmpdir");
        String userDir = System.getProperty("user.dir");
        byte[] buf = new byte[Mapper.BUFLENGTH];
        int count = 0;
        GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(resourceFile));
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        while ((count = gzis.read(buf)) != -1) {
            baos.write(buf, 0, count);
        }
        gzis.close();
        baos.close();
        byte[] byteArray = baos.toByteArray();
        String uuid = UUID.randomUUID().toString();
        String pdfFileName = uuid + ".pdf";
        String htmlFileName = uuid + ".html";
        File tmpDir = new File(workingDirName);
        String htmlFileNameCompletePath = workingDirName + JobMapperMonitor.FILE_SEPARATOR + htmlFileName;
        File fileToConvert = new File(tmpDir, pdfFileName);
        FileUtils.writeByteArrayToFile(fileToConvert, byteArray);
        DefaultExecutor executor = new DefaultExecutor();
        executor.setExitValue(0);
        CommandParameter cp = entityManager.find(CommandParameter.class,
                CommandParameter.PDFTOHTML_EXECUTABLE_KEY);
        CommandLine commandLine = CommandLine.parse(cp.getCommandValue());
        commandLine.addArgument("-c");
        commandLine.addArgument("-i");
        commandLine.addArgument(fileToConvert.getAbsolutePath());
        commandLine.addArgument(htmlFileNameCompletePath);
        executor.execute(commandLine);
        try {
            FileUtils.moveFileToDirectory(
                    new File(userDir + JobMapperMonitor.FILE_SEPARATOR + uuid + "-outline.html"), tmpDir, false);
        } catch (IOException e) {
            // the -outline.html file is not always produced; ignore it if missing
        }
        cp = entityManager.find(CommandParameter.class, CommandParameter.PDFCLEANER_EXECUTABLE_KEY);
        commandLine = CommandLine
                .parse("java -Xmx128m -jar -Djava.io.tmpdir=" + this.tempDir + " " + cp.getCommandValue());
        commandLine.addArgument(htmlFileNameCompletePath);
        commandLine.addArgument("39");
        commandLine.addArgument("6");
        commandLine.addArgument("5");
        executor = new DefaultExecutor();
        executor.setExitValue(0);
        ExecuteWatchdog watchdog = new ExecuteWatchdog(Mapper.PDFCLEANER_TIMEOUT);
        executor.setWatchdog(watchdog);
        ByteArrayOutputStream baosStdOut = new ByteArrayOutputStream(1024);
        ExecuteStreamHandler executeStreamHandler = new PumpStreamHandler(baosStdOut, null, null);
        executor.setStreamHandler(executeStreamHandler);
        int exitValue = executor.execute(commandLine);
        String htmlString = null;
        if (exitValue == 0) {
            htmlString = baosStdOut.toString();
        }
        FileUtils.deleteQuietly(new File(htmlFileNameCompletePath));
        PrefixFileFilter pff = new PrefixFileFilter(uuid);
        for (File f : FileUtils.listFiles(tmpDir, pff, null)) {
            FileUtils.deleteQuietly(f);
        }
        if (htmlString != null) {
            // the literal below was lost in this file's encoding; a
            // non-breaking space (U+00A0) is assumed
            htmlString = htmlString.replaceAll("\u00A0", " ");
            htmlString = htmlString.replaceAll("<br.*?>", " ");
            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.setText(htmlString.getBytes());
            String encoding = charsetDetector.detect().getName();
            return new StringWithEncoding(htmlString, encoding);
        }
        return null;
    }

    private StringWithEncoding transformRTF2HTML(File resourceFile, EntityManager entityManager)
            throws IOException, SAXException, TikaException, TransformerConfigurationException {
        // AutoDetectParser works for RTF as it does for DOC
        return this.transformDOC2HTML(resourceFile, entityManager);
    }

    private StringWithEncoding transformTXT2HTML(File resourceFile, EntityManager entityManager)
            throws IOException, SAXException, TikaException, TransformerConfigurationException {
        // AutoDetectParser works for TXT as it does for DOC
        return this.transformDOC2HTML(resourceFile, entityManager);
    }
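    /**
     * Heritrix may leave ARC files gzip-compressed; this unpacks a .gz ARC
     * next to the original (the 'true' flag makes the stream decompress
     * concatenated gzip members, as used by ARC files) so it can be read. The
     * uncompressed copy is deleted after mapping.
     */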
    private File uncompressGzippedArcFile(File f) throws IOException {
        FileInputStream fin = new FileInputStream(f.getAbsolutePath());
        BufferedInputStream in = new BufferedInputStream(fin);
        File uncompressedFile = new File(FilenameUtils.removeExtension(f.getAbsolutePath()));
        FileOutputStream out = new FileOutputStream(uncompressedFile);
        GzipCompressorInputStream gzIn = new GzipCompressorInputStream(in, true);
        final byte[] buffer = new byte[1024];
        int n = 0;
        while (-1 != (n = gzIn.read(buffer))) {
            out.write(buffer, 0, n);
        }
        out.close();
        gzIn.close();
        return uncompressedFile;
    }

    private void updateArchivedResource(File f, CrawledResource cr, String oldDigest, EntityManager entityManager)
            throws SAXException, TikaException, IOException {
        this.createArchivedResource(f, cr, entityManager);
        this.removeModifiedArchivedResource(f, oldDigest, entityManager);
    }
}
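/*
 * Usage sketch (assumptions: in the RIDIRE codebase Mapper instances are
 * created by a monitor component such as JobMapperMonitor; the worker thread
 * below is illustrative only). Mapper manages its own Seam call context via
 * Lifecycle.beginCall()/endCall(), so it can run on a plain thread:
 *
 *     Mapper mapper = new Mapper(job, flagBearer);
 *     if (!mapper.isRunning()) {
 *         new Thread(mapper, "mapper-" + mapper.getJobName()).start();
 *     }
 */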