Java tutorial
/******************************************************************************* * Copyright 2013 Universit degli Studi di Firenze * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package it.drwolf.ridire.session; import it.drwolf.ridire.entity.CrawledResource; import it.drwolf.ridire.entity.FunctionalMetadatum; import it.drwolf.ridire.entity.Job; import it.drwolf.ridire.entity.Parameter; import it.drwolf.ridire.entity.ScheduledJobHandle; import it.drwolf.ridire.entity.SemanticMetadatum; import it.drwolf.ridire.entity.User; import it.drwolf.ridire.session.async.JobMapperMonitor; import it.drwolf.ridire.session.async.ScheduledJobExecutor; import it.drwolf.ridire.util.MD5DigestCreator; import it.drwolf.ridire.util.data.PoSLine; import it.drwolf.ridire.util.exceptions.CrawlingFileException; import it.drwolf.ridire.util.exceptions.FileHandlingException; import it.drwolf.ridire.util.exceptions.HeritrixException; import it.drwolf.ridire.util.exceptions.JobHandlingException; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.math.BigDecimal; import java.security.NoSuchAlgorithmException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import javax.faces.model.SelectItem; import javax.persistence.EntityManager; import javax.persistence.Query; import javax.transaction.SystemException; import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.text.StrTokenizer; import org.archive.crawler.framework.CrawlStatus; import org.hibernate.Session; import org.jboss.seam.ScopeType; import org.jboss.seam.annotations.Create; import org.jboss.seam.annotations.Factory; import org.jboss.seam.annotations.In; import org.jboss.seam.annotations.Name; import org.jboss.seam.annotations.Scope; import org.jboss.seam.annotations.Synchronized; import org.jboss.seam.annotations.Transactional; import org.jboss.seam.annotations.async.Asynchronous; import org.jboss.seam.async.QuartzTriggerHandle; import org.jboss.seam.contexts.Contexts; import org.jboss.seam.core.Conversation; import org.jboss.seam.security.Identity; import org.jboss.seam.transaction.UserTransaction; import org.quartz.SchedulerException; @Scope(ScopeType.CONVERSATION) @Name("jobManager") @Synchronized public class JobManager { final private static DecimalFormat DECIMAL_FORMAT = new DecimalFormat("0.00"); final private static int MAX_FILE_SIZE = 100 * 1024 * 1024; // private static final int PAGE_SIZE = 10; @In(create = true) private CrawlerManager crawlerManager; @In(create = true) private Map<String, String> messages; @In private Identity identity; @In(create = true) private ScheduledJobExecutor scheduledJobExecutor; @In private EntityManager entityManager; @In(required = false) private Profile profile; private Job job; private String filterNameValue; List<Job> jobs; private Integer functionalMetadatum; private Integer semanticMetadatum; private Integer functionalMetadatumValue; private Integer semanticMetadatumValue; private String pollInterval = "30000"; // default 30 sec. private String filterUserValue; private String filterURLValue; private Integer filterStatusValue; private Integer filterResStatusValue; private String filterMimeTypeValue; public Integer firstResult; // parameter used to delete resources with a lower words number private Integer wordsLimitDeletionNumber = 40; private int pageSize = 10; private final Map<String, Integer> allStatusMap = new LinkedHashMap<String, Integer>() { /** * */ private static final long serialVersionUID = -5866892546719358941L; { this.put("", -1); this.put("RUNNING", 1); this.put("FINISHED", 0); this.put("CREATED", 2); } }; private final Map<String, Integer> allResStatusMap = new LinkedHashMap<String, Integer>() { /** * */ private static final long serialVersionUID = -2832735400156804117L; { this.put("", -1); this.put("Da mappare", 2); this.put("Mappatura in corso", 3); this.put("Mappate", 0); this.put("Non presenti", 1); } }; private List<CrawledResource> results; private Long totalResults; private String cleanedText; private List<PoSLine> posText = new ArrayList<PoSLine>(); private static final String RESOURCESDIR = "resources/"; // private static final String NO_POS_AVAILABLE = // "Non esiste il PoS tagging per la risorsa."; private String sortField; private String sortOrder; private List<String> languagesToBeDeleted = new ArrayList<String>(); private Map<Integer, Integer> analyzedResourcesPercentage = new HashMap<Integer, Integer>(); private boolean filterAnalyzed; private boolean filterValidated; private Long totalSelectedResourcesNumber; private UserTransaction userTx; private boolean validationToBeSaved = false; // @In(create = true) // private AsyncIndexer asyncIndexer; private String perlPw; private String cleanerPath; private String perlUser; private String perlCleanerTemplate; private String testAfter; private String testBefore; private String testOutput; public String applyValidationData() { for (CrawledResource cr : this.results) { this.entityManager.merge(cr); } if (this.getValidatedResourcesNumber() == 0) { this.job.setValidationStatus(Job.TO_BE_VALIDATED); } else if (this.getValidatedResourcesNumber() < this.getTotalSelectedResourcesNumber()) { this.job.setValidationStatus(Job.VALIDATION_IN_PROGRESS); } else { this.job.setValidationStatus(this.calculateValidationResult()); } this.entityManager.merge(this.job); this.setValidationToBeSaved(false); return "OK"; } public void assignMetadata() { for (CrawledResource cr : this.job.getCrawledResources()) { if (cr.isChecked()) { FunctionalMetadatum fm = this.entityManager.find(FunctionalMetadatum.class, this.functionalMetadatum); cr.setFunctionalMetadatum(fm); SemanticMetadatum sm = this.entityManager.find(SemanticMetadatum.class, this.semanticMetadatum); cr.setSemanticMetadatum(sm); cr.setChecked(false); if (cr.getProcessed().equals(Parameter.INDEXED)) { cr.setProcessed(Parameter.INDEXING_PHASE); this.entityManager.merge(cr); // this.asyncIndexer.updateSingleCrawledResource(cr.getId(), // true); } } } } private String calculateCronString() { String cronString = ""; Calendar startDate = Calendar.getInstance(); if (this.job.getFirstDate() == null) { return cronString; } startDate.setTime(this.job.getFirstDate()); if (this.job.getPeriodFrequency().equals("yearly")) { cronString = startDate.get(Calendar.SECOND) + " " + startDate.get(Calendar.MINUTE) + " " + startDate.get(Calendar.HOUR_OF_DAY) + " " + startDate.get(Calendar.DAY_OF_MONTH) + " " + startDate.get(Calendar.MONTH) + 1 + " ?"; } else if (this.job.getPeriodFrequency().equals("monthly")) { cronString = startDate.get(Calendar.SECOND) + " " + startDate.get(Calendar.MINUTE) + " " + startDate.get(Calendar.HOUR_OF_DAY) + " " + startDate.get(Calendar.DAY_OF_MONTH) + " * ?"; } else if (this.job.getPeriodFrequency().equals("weekly")) { cronString = startDate.get(Calendar.SECOND) + " " + startDate.get(Calendar.MINUTE) + " " + startDate.get(Calendar.HOUR_OF_DAY) + " ? * " + startDate.get(Calendar.DAY_OF_WEEK); } else if (this.job.getPeriodFrequency().equals("daily")) { cronString = startDate.get(Calendar.SECOND) + " " + startDate.get(Calendar.MINUTE) + " " + startDate.get(Calendar.HOUR_OF_DAY) + " * * ?"; } else if (this.job.getPeriodFrequency().equals("dummy5")) { cronString = startDate.get(Calendar.SECOND) + " " + startDate.get(Calendar.MINUTE) + "/5 " + "* * * ?"; } else if (this.job.getPeriodFrequency().equals("dummy2")) { cronString = startDate.get(Calendar.SECOND) + " " + startDate.get(Calendar.MINUTE) + "/2 " + "* * * ?"; } else if (this.job.getPeriodFrequency().equals("dummy1")) { cronString = startDate.get(Calendar.SECOND) + " " + startDate.get(Calendar.MINUTE) + "/1 " + "* * * ?"; } return cronString; } private Integer calculateValidationResult() { Long notValidResults = (Long) this.entityManager .createQuery("select count(cr) from CrawledResource cr where cr.job=:j and cr.validationStatus=" + CrawledResource.NOT_VALID_RESOURCE + " and cr.validation=" + CrawledResource.VALIDATED) .setParameter("j", this.getJob()).getSingleResult(); if (notValidResults / this.getTotalSelectedResourcesNumber().floatValue() * 100 >= this.getJob() .getValidationThreshold()) { return Job.VALIDATED_NOK; } else { return Job.VALIDATED_OK; } } public String canCreateJob() { long quota = this.crawlerManager.calculateQuota(this.getCurrentUser()); if (quota > 0L) { return "OK"; } return "NOK"; } public void checkAll() { for (CrawledResource cr : this.job.getCrawledResources()) { cr.setChecked(true); } } public void checkNothing() { for (CrawledResource cr : this.job.getCrawledResources()) { cr.setChecked(false); } } public void deleteAllInPage() { for (CrawledResource cr : this.results) { cr.setDeleted(true); this.entityManager.merge(cr); } } public String deleteJob() throws HeritrixException { String ret = this.crawlerManager.deleteJob(this.job.getName(), this.job.getJobStage(), this.getCurrentUser()); // this.asyncIndexer.deleteJobResources(this.job.getId(), false, null); return ret; } public void deleteResourcesByLanguage() { if (this.getLanguagesToBeDeleted() != null && this.getLanguagesToBeDeleted().size() > 0) { List<Integer> crIds = this.entityManager.createQuery( "select cr.id from CrawledResource cr where cr.job=:job and cr.language in (:languages) and cr.processed=:status ") .setParameter("job", this.job).setParameter("status", Parameter.FINISHED) .setParameter("languages", this.getLanguagesToBeDeleted()).getResultList(); String queryString = "update CrawledResource cr set cr.deleted=true, cr.processed=:status where cr.job=:job and cr.language in (:languages)"; int deletedResource = this.entityManager.createQuery(queryString).setParameter("job", this.job) .setParameter("languages", this.getLanguagesToBeDeleted()) .setParameter("status", Parameter.FINISHED).executeUpdate(); // for (Integer crId : crIds) { // this.asyncIndexer.deleteSingleCrawledResource(crId.intValue()); // } Conversation.instance().endBeforeRedirect(); } } public void deleteResourcesWithFewWords() { if (this.getWordsLimitDeletionNumber() != null && this.getWordsLimitDeletionNumber() > 0) { List<Integer> crIds = this.entityManager.createQuery( "select cr.id from CrawledResource cr where cr.job=:job and cr.wordsNumber < :wn and cr.processed=:status ") .setParameter("job", this.job).setParameter("status", Parameter.FINISHED) .setParameter("wn", this.getWordsLimitDeletionNumber()).getResultList(); String queryString = "update CrawledResource cr set cr.processed=:status, cr.deleted=true where cr.job=:job and cr.wordsNumber < :wn"; int deletedResource = this.entityManager.createQuery(queryString).setParameter("job", this.job) .setParameter("wn", this.getWordsLimitDeletionNumber()) .setParameter("status", Parameter.FINISHED).executeUpdate(); // for (Integer crId : crIds) { // this.asyncIndexer.deleteSingleCrawledResource(crId.intValue()); // } Conversation.instance().endBeforeRedirect(); } } public boolean filterFunctionalMetadatum(Object current) { if (this.functionalMetadatumValue == null || this.functionalMetadatumValue < 0) { return true; } CrawledResource cr = (CrawledResource) current; if (cr.getFunctionalMetadatum() != null && cr.getFunctionalMetadatum().getId().equals(this.functionalMetadatumValue)) { return true; } return false; } public boolean filterMimeTypes(Object current) { if (this.filterMimeTypeValue == null || this.filterMimeTypeValue.length() == 0) { return true; } CrawledResource cr = (CrawledResource) current; if (cr.getContentType() != null && cr.getContentType().toLowerCase().contains(this.filterMimeTypeValue.toLowerCase())) { return true; } return false; } public boolean filterNames(Object current) { if (this.filterNameValue == null || this.filterNameValue.length() == 0) { return true; } Job j = (Job) current; if (j.getName().toLowerCase().contains(this.filterNameValue.toLowerCase())) { return true; } return false; } public boolean filterResStatus(Object current) { if (this.filterResStatusValue == null || this.filterResStatusValue < 0) { return true; } Job j = (Job) current; if (j.getJobStage() == null && this.filterResStatusValue == 1) { return true; } if (j.getJobStage() != null && j.isMappedResources() && this.filterResStatusValue == 0) { return true; } if (j.getJobStage() != null && !j.isMappedResources() && (j.getCrawledResources() == null || j.getCrawledResources().size() == 0) && this.filterResStatusValue == 2) { return true; } if (j.getJobStage() != null && !j.isMappedResources() && j.getCrawledResources() != null && j.getCrawledResources().size() != 0 && this.filterResStatusValue == 3) { return true; } return false; } public boolean filterSemanticMetadatum(Object current) { if (this.semanticMetadatumValue == null || this.semanticMetadatumValue < 0) { return true; } CrawledResource cr = (CrawledResource) current; if (cr.getSemanticMetadatum() != null && cr.getSemanticMetadatum().getId().equals(this.semanticMetadatumValue)) { return true; } return false; } public boolean filterStatus(Object current) { if (this.filterStatusValue == null || this.filterStatusValue < 0) { return true; } Job j = (Job) current; Integer statusId = this.allStatusMap.get(j.getJobStage()); if (statusId == null || statusId.equals(this.filterStatusValue)) { return true; } return false; } public boolean filterUsers(Object current) { if (this.filterUserValue == null || this.filterUserValue.length() == 0) { return true; } Job j = (Job) current; if (j.getCrawlerUser() != null && j.getCrawlerUser().getUsername() != null && j.getCrawlerUser().getUsername().contains(this.filterUserValue.toLowerCase())) { return true; } return false; } @SuppressWarnings("unchecked") public List<FunctionalMetadatum> getAllFunctionalMetadata() { return this.entityManager.createQuery("from FunctionalMetadatum m order by m.description").getResultList(); } public Map<String, Integer> getAllResStatusMap() { return this.allResStatusMap; } @SuppressWarnings("unchecked") public List<SemanticMetadatum> getAllSemanticMetadata() { return this.entityManager.createQuery("from SemanticMetadatum m order by m.description").getResultList(); } public Map<String, Integer> getAllStatusMap() { return this.allStatusMap; } public float getAnalyzedResourcesPercentage(Job j) { int res = 0; if (!this.analyzedResourcesPercentage.containsKey(j.getId())) { BigDecimal percentage = null; if (this.isSemanticCrawlerUser(j.getCrawlerUser())) { percentage = (BigDecimal) this.entityManager .createNativeQuery( "SELECT count(cr.id)/(SELECT count(cr2.id) FROM CrawledResource cr2 where cr2.job_id=:job1) FROM CrawledResource cr where cr.job_id=:job2 and (cr.semanticMetadatum_id is not null or cr.deleted=true) ") .setParameter("job1", j.getId()).setParameter("job2", j.getId()).getSingleResult(); } else { percentage = (BigDecimal) this.entityManager.createNativeQuery( "SELECT count(cr.id)/(SELECT count(cr2.id) FROM CrawledResource cr2 where cr2.job_id=:job1) FROM CrawledResource cr where cr.job_id=:job2 and (cr.functionalMetadatum_id is not null or cr.deleted=true) ") .setParameter("job1", j.getId()).setParameter("job2", j.getId()).getSingleResult(); } if (percentage == null) { percentage = new BigDecimal(0); } this.analyzedResourcesPercentage.put(j.getId(), Math.round(percentage.floatValue() * 10000)); } res = this.analyzedResourcesPercentage.get(j.getId()); return res; } public String getCleanedText() { return this.cleanedText; } public String getCompletedResources() { Long finished = (Long) this.entityManager .createQuery("select count(*) from CrawledResource cr where cr.job=:job and cr.processed=:finished") .setParameter("job", this.job).setParameter("finished", Parameter.FINISHED).getSingleResult(); if (finished == 0L) { return "0"; } return Long.toString(finished); } @SuppressWarnings("unchecked") @Factory("crawledResourcesRoots") public CrawledResource[] getCrawledResourcesRoots() { List<CrawledResource> roots = new ArrayList<CrawledResource>(); String queryString = "select cr from CrawledResource cr where cr.job=:job order by cr.url"; List<CrawledResource> orderedResources = this.entityManager.createQuery(queryString) .setParameter("job", this.job).getResultList(); CrawledResource lastVisitedResource = null; for (CrawledResource cr : orderedResources) { if (lastVisitedResource == null) { roots.add(cr); } else { CrawledResource curFather = lastVisitedResource; while (curFather != null) { if (cr.getUrl().contains(curFather.getUrl())) { curFather.getChildren().add(cr); cr.setFather(curFather); break; } curFather = curFather.getFather(); } if (curFather == null) { roots.add(cr); } } lastVisitedResource = cr; } CrawledResource[] array = roots.toArray(new CrawledResource[0]); return array; } public List<SelectItem> getCurrentLanguages() { List<SelectItem> resultList = new ArrayList<SelectItem>(); String queryString = "select distinct(cr.language) from CrawledResource cr where cr.job=:job"; Query query = this.entityManager.createQuery(queryString).setParameter("job", this.job); List<String> curLanguages = query.getResultList(); if (curLanguages == null || curLanguages.size() < 1 || curLanguages.size() == 1 && curLanguages.get(0) == null) { SelectItem si = new SelectItem(""); resultList.add(si); } for (String lang : curLanguages) { if (lang != null) { resultList.add(new SelectItem(lang)); } } return resultList; } @SuppressWarnings("unchecked") private User getCurrentUser() { List<User> users = this.entityManager.createQuery("from User u where u.username=:username") .setParameter("username", this.identity.getCredentials().getUsername()).getResultList(); if (users.size() == 1) { return users.get(0); } return null; } public Integer getDetachedJobId() { if (this.job != null) { return this.job.getId(); } return null; } public Long getDiscoveredURICount() throws HeritrixException, CrawlingFileException { return this.crawlerManager.getDiscoveredURICount(this.job, this.getCurrentUser()); } public String getFilterMimeTypeValue() { return this.filterMimeTypeValue; } public String getFilterNameValue() { return this.filterNameValue; } public Integer getFilterResStatusValue() { return this.filterResStatusValue; } public Integer getFilterStatusValue() { return this.filterStatusValue; } public String getFilterURLValue() { return this.filterURLValue; } public String getFilterUserValue() { return this.filterUserValue; } public Long getFinishedURICount() throws HeritrixException, CrawlingFileException { return this.crawlerManager.getFinishedURICount(this.job, this.getCurrentUser()); } public Integer getFirstResult() { if (this.firstResult == null) { return 0; } return this.firstResult; } public Integer getFunctionalMetadatum() { return this.functionalMetadatum; } public Integer getFunctionalMetadatumValue() { return this.functionalMetadatumValue; } public Job getJob() { return this.job; } @SuppressWarnings("unchecked") @Factory("jobCrawledResources") public List<CrawledResource> getJobCrawledResources() { String queryString = "select cr from CrawledResource cr left join cr.semanticMetadatum left join cr.functionalMetadatum where cr.job=:job"; String queryCountString = "select count(*) from CrawledResource cr where cr.job=:job"; if (this.getFilterURLValue() != null && this.getFilterURLValue().length() > 0) { queryString += " and cr.url like :url"; queryCountString += " and cr.url like :url"; } if (this.getFilterMimeTypeValue() != null && this.getFilterMimeTypeValue().length() > 0) { queryString += " and cr.contentType like :mimeType"; queryCountString += " and cr.contentType like :mimeType"; } if (this.getSemanticMetadatumValue() != null && this.getSemanticMetadatumValue() != -1) { queryString += " and cr.semanticMetadatum.id=:semanticMetadatumId"; queryCountString += " and cr.semanticMetadatum.id=:semanticMetadatumId"; } if (this.getFunctionalMetadatumValue() != null && this.getFunctionalMetadatumValue() != -1) { queryString += " and cr.functionalMetadatum.id=:functionalMetadatumId"; queryCountString += " and cr.functionalMetadatum.id=:functionalMetadatumId"; } if (this.isFilterAnalyzed()) { if (this.isSemanticCrawlerUser(this.job.getCrawlerUser())) { queryString += " and (cr.semanticMetadatum is null and cr.deleted=false) "; queryCountString += " and (cr.semanticMetadatum is null and cr.deleted=false) "; } else { queryString += " and (cr.functionalMetadatum is null and cr.deleted=false) "; queryCountString += " and (cr.functionalMetadatum is null and cr.deleted=false) "; } } if (this.sortField != null && this.sortField.length() > 0) { queryString += " order by cr." + this.sortField; if (this.sortOrder != null) { if (this.sortOrder.equals("down")) { queryString += " desc "; } } } Query query = this.entityManager.createQuery(queryString).setParameter("job", this.job); Query queryCount = this.entityManager.createQuery(queryCountString).setParameter("job", this.job); if (this.getFilterURLValue() != null && this.getFilterURLValue().length() > 0) { query.setParameter("url", "%" + this.getFilterURLValue() + "%"); queryCount.setParameter("url", "%" + this.getFilterURLValue() + "%"); } if (this.getFilterMimeTypeValue() != null && this.getFilterMimeTypeValue().length() > 0) { query.setParameter("mimeType", "%" + this.getFilterMimeTypeValue() + "%"); queryCount.setParameter("mimeType", "%" + this.getFilterMimeTypeValue() + "%"); } if (this.getSemanticMetadatumValue() != null && this.getSemanticMetadatumValue() != -1) { query.setParameter("semanticMetadatumId", this.semanticMetadatumValue); queryCount.setParameter("semanticMetadatumId", this.semanticMetadatumValue); } if (this.getFunctionalMetadatumValue() != null && this.getFunctionalMetadatumValue() != -1) { query.setParameter("functionalMetadatumId", this.functionalMetadatumValue); queryCount.setParameter("functionalMetadatumId", this.functionalMetadatumValue); } if (this.firstResult == null) { this.firstResult = 0; } // System.out.println("*****" + this.pageSize); this.results = query.setFirstResult(this.firstResult).setMaxResults(this.pageSize).getResultList(); if (this.results != null) { this.setTotalResults((Long) queryCount.getSingleResult()); if (this.getFirstResult() > this.getTotalResults()) { this.setFirstResult(0); } } return this.results; } public Integer getJobId() { return this.job.getId(); } public String getJobSeeds() throws HeritrixException { List<String> jobSeeds = new ArrayList<String>(); if (this.profile == null || this.profile.getProfileName() == null) { jobSeeds = this.crawlerManager.getJobSeeds(null, this.job.getName(), this.getCurrentUser()); } else { jobSeeds = this.crawlerManager.getJobSeeds(this.profile.getProfileName(), this.job.getName(), this.getCurrentUser()); } StringBuffer b = new StringBuffer(); for (String j : jobSeeds) { b.append(j.trim() + "\n"); } return b.toString().trim(); } @Factory("jobValidationData") public Map<String, Number> getJobValidationData() { return this.getJobValidationDataForJob(this.job, this.entityManager); } private Map<String, Number> getJobValidationDataForJob(Job job, EntityManager entityManager) { Map<String, Number> res = new HashMap<String, Number>(); // risorse Long resTot = (Long) entityManager .createQuery("select count(cr.id) from CrawledResource cr where cr.job=:job") .setParameter("job", job).getSingleResult(); res.put("res-tot", resTot); Long res0_100 = (Long) entityManager .createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 101") .setParameter("job", job).getSingleResult(); res.put("res0-100", res0_100); Long res0_50 = (Long) entityManager .createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 51") .setParameter("job", job).getSingleResult(); res.put("res0-50", res0_50); Long res51_100 = (Long) entityManager.createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 101 and cr.wordsNumber > 50 ") .setParameter("job", job).getSingleResult(); res.put("res51-100", res51_100); Long res101_1000 = (Long) entityManager.createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>100") .setParameter("job", job).getSingleResult(); res.put("res101-1000", res101_1000); Long res101_500 = (Long) entityManager.createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 501 and cr.wordsNumber>100") .setParameter("job", job).getSingleResult(); res.put("res101-500", res101_500); Long res501_1000 = (Long) entityManager.createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>500") .setParameter("job", job).getSingleResult(); res.put("res501-1000", res501_1000); Long res1001_5000 = (Long) entityManager.createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 5001 and cr.wordsNumber>1000") .setParameter("job", job).getSingleResult(); res.put("res1001-5000", res1001_5000); Long res5001_10000 = (Long) entityManager.createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 10001 and cr.wordsNumber>5000") .setParameter("job", job).getSingleResult(); res.put("res5001-10000", res5001_10000); Long resGT10000 = (Long) entityManager .createQuery( "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber > 10001") .setParameter("job", job).getSingleResult(); res.put("res10000+", resGT10000); if (resTot > 0L) { res.put("res0-100p", res0_100 * 100.0 / resTot); res.put("res0-50p", res0_50 * 100.0 / resTot); res.put("res51-100p", res51_100 * 100.0 / resTot); res.put("res101-1000p", res101_1000 * 100.0 / resTot); res.put("res101-500p", res101_500 * 100.0 / resTot); res.put("res501-1000p", res501_1000 * 100.0 / resTot); res.put("res1001-5000p", res1001_5000 * 100.0 / resTot); res.put("res5001-10000p", res5001_10000 * 100.0 / resTot); res.put("res10000+p", resGT10000 * 100.0 / resTot); } String q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.deleted=false "; Long ndResTot = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res-tot", ndResTot); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 100 and cr.deleted=false "; Long ndRes0_100 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res0-100", ndRes0_100); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 51 and cr.deleted=false "; Long ndRes0_50 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res0-50", ndRes0_50); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 101 and cr.wordsNumber > 50 and cr.deleted=false "; Long ndRes51_100 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res51-100", ndRes51_100); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>101 and cr.deleted=false "; Long ndRes101_1000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res101-1000", ndRes101_1000); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 501 and cr.wordsNumber>100 and cr.deleted=false "; Long ndRes101_500 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res101-500", ndRes101_500); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>500 and cr.deleted=false "; Long ndRes501_1000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res501-1000", ndRes501_1000); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 5001 and cr.wordsNumber>1000 and cr.deleted=false "; Long ndRes1001_5000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res1001-5000", ndRes1001_5000); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 10001 and cr.wordsNumber>5000 and cr.deleted=false "; Long ndRes5001_10000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res5001-10000", ndRes5001_10000); q = "select count(cr.id) from CrawledResource cr where cr.job=:job and cr.wordsNumber > 10001 and cr.deleted=false "; Long ndResGT10000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); res.put("nd-res10000+", ndResGT10000); if (ndResTot > 0L) { res.put("nd-res0-100p", ndRes0_100 * 100.0 / ndResTot); res.put("nd-res0-50p", ndRes0_50 * 100.0 / ndResTot); res.put("nd-res51-100p", ndRes51_100 * 100.0 / ndResTot); res.put("nd-res101-1000p", ndRes101_1000 * 100.0 / ndResTot); res.put("nd-res101-500p", ndRes101_500 * 100.0 / ndResTot); res.put("nd-res501-1000p", ndRes501_1000 * 100.0 / ndResTot); res.put("nd-res1001-5000p", ndRes1001_5000 * 100.0 / ndResTot); res.put("nd-res5001-10000p", ndRes5001_10000 * 100.0 / ndResTot); res.put("nd-res10000+p", ndResGT10000 * 100.0 / ndResTot); } // parole Long wordsTot = (Long) entityManager .createQuery("select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job") .setParameter("job", job).getSingleResult(); if (wordsTot == null) { wordsTot = 0L; } res.put("words-tot", wordsTot); Long words0_100 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 101") .setParameter("job", job).getSingleResult(); if (words0_100 == null) { words0_100 = 0L; } res.put("words0-100", words0_100); Long words0_50 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 51") .setParameter("job", job).getSingleResult(); if (words0_50 == null) { words0_50 = 0L; } res.put("words0-50", words0_50); Long words51_100 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 101 and cr.wordsNumber > 50 ") .setParameter("job", job).getSingleResult(); if (words51_100 == null) { words51_100 = 0L; } res.put("words51-100", words51_100); Long words101_1000 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>101") .setParameter("job", job).getSingleResult(); if (words101_1000 == null) { words101_1000 = 0L; } res.put("words101-1000", words101_1000); Long words101_500 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 501 and cr.wordsNumber>100") .setParameter("job", job).getSingleResult(); if (words101_500 == null) { words101_500 = 0L; } res.put("words101-500", words101_500); Long words501_1000 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>500") .setParameter("job", job).getSingleResult(); if (words501_1000 == null) { words501_1000 = 0L; } res.put("words501-1000", words501_1000); Long words1001_5000 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 5001 and cr.wordsNumber>1000") .setParameter("job", job).getSingleResult(); if (words1001_5000 == null) { words1001_5000 = 0L; } res.put("words1001-5000", words1001_5000); Long words5001_10000 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 10001 and cr.wordsNumber>5000") .setParameter("job", job).getSingleResult(); if (words5001_10000 == null) { words5001_10000 = 0L; } res.put("words5001-10000", words5001_10000); Long wordsGT10000 = (Long) entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber > 10001") .setParameter("job", job).getSingleResult(); if (wordsGT10000 == null) { wordsGT10000 = 0L; } res.put("words10000+", wordsGT10000); if (wordsTot > 0L) { res.put("words0-100p", words0_100 * 100.0 / wordsTot); res.put("words0-50p", words0_50 * 100.0 / wordsTot); res.put("words51-100p", words51_100 * 100.0 / wordsTot); res.put("words101-1000p", words101_1000 * 100.0 / wordsTot); res.put("words101-1000p", words101_1000 * 100.0 / wordsTot); res.put("words101-500p", words101_500 * 100.0 / wordsTot); res.put("words501-1000p", words501_1000 * 100.0 / wordsTot); res.put("words1001-5000p", words1001_5000 * 100.0 / wordsTot); res.put("words5001-10000p", words5001_10000 * 100.0 / wordsTot); res.put("words10000+p", wordsGT10000 * 100.0 / wordsTot); } // // parole analizzate // parole non cancellate q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.deleted=false "; Long ndWordsTot = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWordsTot == null) { ndWordsTot = 0L; } res.put("nd-words-tot", ndWordsTot); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 101 and cr.deleted=false "; Long ndWords0_100 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords0_100 == null) { ndWords0_100 = 0L; } res.put("nd-words0-100", ndWords0_100); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 51 and cr.deleted=false "; Long ndWords0_50 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords0_50 == null) { ndWords0_50 = 0L; } res.put("nd-words0-50", ndWords0_50); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 101 and cr.wordsNumber > 50 and cr.deleted=false "; Long ndWords51_100 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords51_100 == null) { ndWords51_100 = 0L; } res.put("nd-words51-100", ndWords51_100); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>100 and cr.deleted=false "; Long ndWords101_1000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords101_1000 == null) { ndWords101_1000 = 0L; } res.put("nd-words101-1000", ndWords101_1000); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 501 and cr.wordsNumber>100 and cr.deleted=false "; Long ndWords101_500 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords101_500 == null) { ndWords101_500 = 0L; } res.put("nd-words101-500", ndWords101_500); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 1001 and cr.wordsNumber>500 and cr.deleted=false "; Long ndWords501_1000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords501_1000 == null) { ndWords501_1000 = 0L; } res.put("nd-words501-1000", ndWords501_1000); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 5001 and cr.wordsNumber>1000 and cr.deleted=false "; Long ndWords1001_5000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords1001_5000 == null) { ndWords1001_5000 = 0L; } res.put("nd-words1001-5000", ndWords1001_5000); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber < 10001 and cr.wordsNumber>5000 and cr.deleted=false "; Long ndWords5001_10000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWords5001_10000 == null) { ndWords5001_10000 = 0L; } res.put("nd-words5001-10000", ndWords5001_10000); q = "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.wordsNumber > 10001 and cr.deleted=false "; Long ndWordsGT10000 = (Long) entityManager.createQuery(q).setParameter("job", job).getSingleResult(); if (ndWordsGT10000 == null) { ndWordsGT10000 = 0L; } res.put("nd-words10000+", ndWordsGT10000); if (ndWordsTot > 0L) { res.put("nd-words0-100p", ndWords0_100 * 100.0 / ndWordsTot); res.put("nd-words0-50p", ndWords0_50 * 100.0 / ndWordsTot); res.put("nd-words51-100p", ndWords51_100 * 100.0 / ndWordsTot); res.put("nd-words101-1000p", ndWords101_1000 * 100.0 / ndWordsTot); res.put("nd-words101-1000p", ndWords101_1000 * 100.0 / ndWordsTot); res.put("nd-words101-500p", ndWords101_500 * 100.0 / ndWordsTot); res.put("nd-words501-1000p", ndWords501_1000 * 100.0 / ndWordsTot); res.put("nd-words1001-5000p", ndWords1001_5000 * 100.0 / ndWordsTot); res.put("nd-words5001-10000p", ndWords5001_10000 * 100.0 / ndWordsTot); res.put("nd-words10000+p", ndWordsGT10000 * 100.0 / ndWordsTot); } return res; } @Factory("jobValidationResources") public List<CrawledResource> getJobValidationResources() { String queryString = "select cr from CrawledResource cr left join cr.semanticMetadatum left join cr.functionalMetadatum where cr.validation <> :v and cr.job=:job"; String queryCountString = "select count(*) from CrawledResource cr where cr.validation <> :v and cr.job=:job"; String queryTotalCountString = queryCountString; if (this.getFilterURLValue() != null && this.getFilterURLValue().length() > 0) { queryString += " and cr.url like :url"; queryCountString += " and cr.url like :url"; } if (this.getFilterMimeTypeValue() != null && this.getFilterMimeTypeValue().length() > 0) { queryString += " and cr.contentType like :mimeType"; queryCountString += " and cr.contentType like :mimeType"; } if (this.isFilterValidated()) { queryString += " and cr.validation <> " + CrawledResource.VALIDATED + " "; queryCountString += " and cr.validation <> " + CrawledResource.VALIDATED + " "; } if (this.sortField != null && this.sortField.length() > 0) { queryString += " order by cr." + this.sortField; if (this.sortOrder != null) { if (this.sortOrder.equals("down")) { queryString += " desc "; } } } Query query = this.entityManager.createQuery(queryString).setParameter("job", this.job).setParameter("v", CrawledResource.NOT_CHOOSEN_FOR_VALIDATION); Query queryCount = this.entityManager.createQuery(queryCountString).setParameter("job", this.job) .setParameter("v", CrawledResource.NOT_CHOOSEN_FOR_VALIDATION); Query queryTotalCount = this.entityManager.createQuery(queryTotalCountString).setParameter("job", this.job) .setParameter("v", CrawledResource.NOT_CHOOSEN_FOR_VALIDATION); if (this.getFilterURLValue() != null && this.getFilterURLValue().length() > 0) { query.setParameter("url", "%" + this.getFilterURLValue() + "%"); queryCount.setParameter("url", "%" + this.getFilterURLValue() + "%"); } if (this.getFilterMimeTypeValue() != null && this.getFilterMimeTypeValue().length() > 0) { query.setParameter("mimeType", "%" + this.getFilterMimeTypeValue() + "%"); queryCount.setParameter("mimeType", "%" + this.getFilterMimeTypeValue() + "%"); } if (this.firstResult == null) { this.firstResult = 0; } // System.out.println("*****" + this.pageSize); this.results = query.setFirstResult(this.firstResult).setMaxResults(this.pageSize).getResultList(); this.setTotalSelectedResourcesNumber((Long) queryTotalCount.getSingleResult()); if (this.results != null) { this.setTotalResults((Long) queryCount.getSingleResult()); if (this.getFirstResult() > this.getTotalResults()) { this.setFirstResult(0); } } this.entityManager.clear(); return this.results; } @Factory("jobWordsCount") public Long getJobWordsCount() { return (Long) this.entityManager.createQuery( "select sum(cr.wordsNumber) from CrawledResource cr where cr.job=:job and cr.deleted=false and cr.noMoreAvailable=false") .setParameter("job", this.job).getSingleResult(); } public List<String> getLanguagesToBeDeleted() { return this.languagesToBeDeleted; } public long getLastFirstResult() { return (this.getTotalResults() - 1) / this.pageSize * this.pageSize; } public Date getLastUpdate() { return (Date) this.entityManager .createQuery("select max(cr.lastModified) from CrawledResource cr where cr.job=:job") .setParameter("job", this.job).getSingleResult(); } public String getMimeTypeList() { return StringUtils.join(this.job.getChosenMimeTypes(), ", "); } public Integer getNextFirstResult() { return (this.getFirstResult() == null ? 0 : this.getFirstResult()) + this.pageSize; } public int getPageSize() { // System.out.println(this.pageSize); return this.pageSize; } public String getPercentCompleted() { Double percent = this.getPercentDouble(); return JobManager.DECIMAL_FORMAT.format(percent); } public String getPercentCompletedInt() { return "" + Math.round(this.getPercentDouble()); } private Double getPercentDouble() { String completed = this.getCompletedResources(); String total = this.getTotalResources(); if (completed == null || total == null) { return 0.0; } Long totalLong = Long.parseLong(total); if (totalLong < 1L) { return 0.0; } Long completedLong = Long.parseLong(completed); Double percent = completedLong * 1.0 / totalLong * 100; return percent; } public String getPollInterval() { return this.pollInterval; } public List<PoSLine> getPosText() { return this.posText; } public Integer getPreviousFirstResult() { if (this.pageSize > (this.getFirstResult() == null ? 0 : this.getFirstResult())) { return 0; } else { return this.getFirstResult() - this.pageSize; } } public Long getQueuedURICount() throws HeritrixException, CrawlingFileException { return this.crawlerManager.getQueuedURICount(this.job, this.getCurrentUser()); } public Long getRawBytes() throws HeritrixException, CrawlingFileException { if (this.crawlerManager.getCrawlerEngineStatus().equals(CrawlerManager.RUNNING)) { return this.crawlerManager.getRawBytes(this.job.getName(), this.getCurrentUser()); } return 0L; } public Long getResourcesBytes() { return (Long) this.entityManager .createQuery("select sum(cr.length) from CrawledResource cr where cr.job=:job and cr.deleted=false") .setParameter("job", this.job).getSingleResult(); } public Integer getResultCount() { if (this.results != null) { return this.results.size(); } return 0; } public Integer getSemanticMetadatum() { return this.semanticMetadatum; } public Integer getSemanticMetadatumValue() { return this.semanticMetadatumValue; } public String getSortField() { return this.sortField; } public String getSortOrder() { return this.sortOrder; } public String getTestAfter() { return this.testAfter; } public String getTestBefore() { return this.testBefore; } public String getTestOutput() { return this.testOutput; } public String getTotalResources() { Long all = (Long) this.entityManager .createQuery("select count(*) from CrawledResource cr where cr.job=:job") .setParameter("job", this.job).getSingleResult(); if (all == 0L) { return "0"; } return Long.toString(all); } public Long getTotalResults() { // System.out.println(this.totalResults); return this.totalResults; } public Long getTotalSelectedResourcesNumber() { return this.totalSelectedResourcesNumber; } public long getValidatedResourcesNumber() { return (Long) this.entityManager .createQuery("select count (cr) from CrawledResource cr where cr.job=:j and cr.validation = " + CrawledResource.VALIDATED) .setParameter("j", this.getJob()).getSingleResult(); } public Integer getWordsLimitDeletionNumber() { return this.wordsLimitDeletionNumber; } public String indexSingleCrawledResource(CrawledResource crawledResource) { crawledResource.setProcessed(Parameter.INDEXING_PHASE); this.entityManager.merge(crawledResource); this.entityManager.flush(); // this.asyncIndexer.indexSingleCrawledResource(crawledResource.getId(), // null, true); return "OK"; } @Factory("jobs") public List<Job> initJobs() throws HeritrixException { return this.crawlerManager.getJobs(this.getCurrentUser()); } @Create public void initParams() { this.perlUser = this.entityManager.find(Parameter.class, Parameter.PERL_CLEANER_USER.getKey()).getValue(); this.perlPw = this.entityManager.find(Parameter.class, Parameter.PERL_CLEANER_PW.getKey()).getValue(); this.cleanerPath = this.entityManager.find(Parameter.class, Parameter.PERL_CLEANER_PATH.getKey()) .getValue(); this.perlCleanerTemplate = this.entityManager .find(Parameter.class, Parameter.PERL_CLEANER_TEMPLATE.getKey()).getValue(); } public boolean isFilterAnalyzed() { return this.filterAnalyzed; } public boolean isFilterValidated() { return this.filterValidated; } public boolean isJobCompleted() throws HeritrixException { return this.job.getJobStage().equals(CrawlStatus.ABORTED.toString()) || this.job.getJobStage().equals(CrawlStatus.FINISHED.toString()); } public boolean isJobPaused() throws HeritrixException { String jobStage = this.job.getJobStage(); if (jobStage == null) { return false; } return jobStage.equals(CrawlStatus.PAUSED.toString()); } public boolean isJobRunning() throws HeritrixException { return this.job.getJobStage().equals(CrawlStatus.RUNNING.toString()); } public boolean isNextExists() { return this.results != null && this.results.size() >= this.pageSize && this.getTotalResults() > this.getFirstResult() + this.pageSize; } public boolean isPollEnabled() { if (this.getPollInterval() == null || this.getPollInterval().equals("0")) { return false; } return true; } public boolean isPreviousExists() { if (this.results != null) { return this.getFirstResult() != null && this.getFirstResult() != 0; } return false; } private boolean isSemanticCrawlerUser(User crawlerUser) { String body = crawlerUser.getBody(); if (body != null) { if (body.contains("LABLITA") || body.contains("UNITO") || body.contains("UNIFI") || body.contains("UNISI")) { return false; } } return true; } public boolean isValidationToBeSaved() { return this.validationToBeSaved; } public boolean noCrawledResources(Integer jobId) { return this.entityManager.createQuery("from CrawledResource cr where cr.job.id=:jobId") .setParameter("jobId", jobId).setMaxResults(1).getResultList().size() != 1; } public void pauseJob() throws HeritrixException { this.crawlerManager.pauseJob(this.job.getName(), this.getCurrentUser()); } private void persistHandle(QuartzTriggerHandle handle) throws IOException, SchedulerException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); ObjectOutputStream oos; oos = new ObjectOutputStream(baos); oos.writeObject(handle); oos.close(); ScheduledJobHandle sjh = new ScheduledJobHandle(); sjh.setJobFullName(handle.getTrigger().getFullJobName()); sjh.setSerializedHandle(baos.toByteArray()); sjh.setJob(this.job); this.entityManager.persist(sjh); this.job.setScheduledJobHandle(sjh); } public void refresh() throws HeritrixException { this.crawlerManager.updateJobsList(this.getCurrentUser()); } private void removePreviousScheduling() throws IOException, ClassNotFoundException, SchedulerException { ScheduledJobHandle sjh = this.job.getScheduledJobHandle(); if (sjh == null) { return; } byte[] serializedHandle = sjh.getSerializedHandle(); ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(serializedHandle)); QuartzTriggerHandle qth = (QuartzTriggerHandle) ois.readObject(); ois.close(); qth.cancel(); sjh.setJob(null); this.job.setScheduledJobHandle(null); this.entityManager.remove(sjh); } public String reprocessResources() { this.job.setMappedResources(false); List<CrawledResource> crawledResources = new ArrayList<CrawledResource>(this.job.getCrawledResources()); for (CrawledResource cr : crawledResources) { this.job.getCrawledResources().remove(cr); this.entityManager.remove(cr); // this.asyncIndexer.deleteSingleCrawledResource(cr.getId()); } List<CrawledResource> noMoreAvailableResources = new ArrayList<CrawledResource>( this.job.getNoMoreAvailableResources()); for (CrawledResource cr : noMoreAvailableResources) { this.job.getCrawledResources().remove(cr); this.entityManager.remove(cr); } this.job.setValidationNotes(null); this.job.setValidationStatus(Job.TO_BE_VALIDATED); this.entityManager.merge(this.job); this.entityManager.flush(); return "OK"; } public String resetFactory() { Contexts.getConversationContext().set("jobCrawledResources", null); return "RESET"; } public String resetValidationData() { this.entityManager .createQuery("update CrawledResource cr set cr.validation=" + CrawledResource.NOT_CHOOSEN_FOR_VALIDATION + ", cr.validationStatus=" + CrawledResource.TO_BE_VALIDATED_RESOURCE + " where cr.job=:j") .setParameter("j", this.getJob()).executeUpdate(); this.getJob().setValidationStatus(Job.TO_BE_VALIDATED); this.getJob().setValidationNotes(null); this.getJob().setValidationThreshold(10); this.entityManager.merge(this.getJob()); this.entityManager.flush(); this.entityManager.clear(); this.resetValidationFactory(); return "OK"; } public String resetValidationFactory() { Contexts.getConversationContext().set("jobValidationResources", null); return "RESET"; } public List<CrawledResource> resourcesSample() { return this.job.getCrawledResources().subList(0, Math.min(this.job.getCrawledResources().size(), 30)); } public void resumeJob() throws HeritrixException { this.crawlerManager.resumeJob(this.job.getName(), this.getCurrentUser()); } public void retrieveCleanedText(CrawledResource cr) { File resourceDir = new File( FilenameUtils.getFullPath(cr.getArcFile().replaceAll("__\\d+", "")) + JobManager.RESOURCESDIR); File cleanedTextFile = new File(resourceDir, cr.getDigest() + ".txt"); if (cleanedTextFile.exists() && cleanedTextFile.canRead()) { try { this.setCleanedText(FileUtils.readFileToString(cleanedTextFile)); } catch (IOException e) { e.printStackTrace(); } } else { this.setCleanedText(""); } } public void retrievePoSText(CrawledResource cr) { File resourceDir = new File( FilenameUtils.getFullPath(cr.getArcFile().replaceAll("__\\d+", "")) + JobManager.RESOURCESDIR); File posTextFile = new File(resourceDir, cr.getDigest() + ".txt.pos"); List<PoSLine> posLines = new ArrayList<PoSLine>(); try { List<String> lines = FileUtils.readLines(posTextFile); StrTokenizer tokenizer = StrTokenizer.getTSVInstance(); for (String l : lines) { tokenizer.reset(l); String[] tokens = tokenizer.getTokenArray(); if (tokens.length == 3) { PoSLine poSLine = new PoSLine(); poSLine.setForm(tokens[0].trim()); poSLine.setPosTag(tokens[1].trim()); poSLine.setLemma(tokens[2].trim()); posLines.add(poSLine); } } } catch (IOException e) { } this.setPosText(posLines); } public void saveJob() { if (this.job != null) { this.entityManager.persist(this.job); } } public void savePeriodicity() throws HeritrixException, FileHandlingException, JobHandlingException { this.entityManager.merge(this.job); User user = this.getCurrentUser(); try { this.removePreviousScheduling(); if (this.job.getPeriodicity().equals("periodic")) { String cronString = this.calculateCronString(); QuartzTriggerHandle handle = this.scheduledJobExecutor.schedulePeriodicJob(new Date(), cronString, this.job.getEndDate(), this.job, user); this.persistHandle(handle); this.job.setCronData(cronString); } else if (this.job.getPeriodicity().equals("once")) { QuartzTriggerHandle handle = this.scheduledJobExecutor.scheduleRunOnceJob(this.job.getFirstDate(), this.job, user); this.persistHandle(handle); this.job.setCronData(this.job.getFirstDate().toString()); } else { this.job.setCronData("never"); } } catch (IOException e) { throw new FileHandlingException(this.messages.get("FileRetrievingError")); } catch (ClassNotFoundException e) { throw new JobHandlingException(this.messages.get("JobRetrievingError")); } catch (SchedulerException e) { throw new JobHandlingException(this.messages.get("JobRetrievingError")); } } public String saveValidationThreshold() { if (this.getJob().getValidationStatus().equals(Job.VALIDATED_NOK) || this.getJob().getValidationStatus().equals(Job.VALIDATED_OK)) { this.getJob().setValidationStatus(this.calculateValidationResult()); } this.entityManager.merge(this.getJob()); this.resetValidationFactory(); return "OK"; } private void selectResourceToBeValidated(int amount, int startIdx, int endIdx, EntityManager entityManager2, Job j) throws NoSuchAlgorithmException, IOException { if (amount < 1) { return; } List<Integer> ids = new ArrayList<Integer>(); Map<String, Integer> hashes = new HashMap<String, Integer>(); int cursize = hashes.size(); while (hashes.size() < amount) { for (CrawledResource cr : (List<CrawledResource>) entityManager2.createQuery( "from CrawledResource cr where cr.job=:j and cr.wordsNumber<:eidx and cr.wordsNumber>=:sidx and cr.deleted=false order by rand()") .setParameter("j", j).setParameter("sidx", startIdx).setParameter("eidx", endIdx) .setMaxResults(amount).getResultList()) { if (cr.getExtractedTextHash() == null) { File f = new File(FilenameUtils.getFullPath(cr.getArcFile()) + JobMapperMonitor.RESOURCESDIR + cr.getDigest() + ".txt"); if (f.exists() && f.canRead()) { cr.setExtractedTextHash(MD5DigestCreator.getMD5Digest(f)); entityManager2.merge(cr); } else { continue; } } hashes.put(cr.getExtractedTextHash(), cr.getId()); } if (cursize == hashes.size()) { break; } else { cursize = hashes.size(); } } ids.addAll(hashes.values()); if (ids.size() > 0) { entityManager2.createQuery("update CrawledResource cr set cr.validation=:v where cr.id in (:ids)") .setParameter("ids", ids).setParameter("v", CrawledResource.CHOOSEN_FOR_VALIDATION) .executeUpdate(); } } @Asynchronous @Transactional public String selectValidationResources(Integer jobId) { int transactionTimeoutSeconds = 600; this.userTx = (UserTransaction) org.jboss.seam.Component .getInstance("org.jboss.seam.transaction.transaction"); try { this.userTx.setTransactionTimeout(transactionTimeoutSeconds); // set timeout to // 10 mins if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); Job j = this.entityManager.find(Job.class, jobId); j.setValidationStatus(Job.VALIDATION_WAIT); this.entityManager.persist(j); this.entityManager.flush(); this.userTx.commit(); if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); Number resourcesWordsNumberAverage = (Number) this.entityManager .createQuery("select avg(cr.wordsNumber) from CrawledResource cr where cr.job=:j") .setParameter("j", j).getSingleResult(); double percent = 0.0; if (resourcesWordsNumberAverage.doubleValue() < 100.0) { percent = 0.01; } else if (resourcesWordsNumberAverage.doubleValue() > 99100.0) { percent = 1; } else { percent = resourcesWordsNumberAverage.doubleValue() / 1000 + 0.9; } percent = percent / 100; if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); Map<String, Number> jobValidationData = this.getJobValidationDataForJob(j, this.entityManager); this.entityManager.flush(); this.userTx.commit(); int n0_100 = new Double(Math.ceil(jobValidationData.get("nd-res0-100").doubleValue() * percent)) .intValue(); if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); this.selectResourceToBeValidated(n0_100, 0, 100, this.entityManager, j); this.entityManager.flush(); this.userTx.commit(); int n101_1000 = new Double(Math.ceil(jobValidationData.get("nd-res101-1000").doubleValue() * percent)) .intValue(); if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); this.selectResourceToBeValidated(n101_1000, 101, 1000, this.entityManager, j); this.entityManager.flush(); this.userTx.commit(); int n1001_5000 = new Double(Math.ceil(jobValidationData.get("nd-res1001-5000").doubleValue() * percent)) .intValue(); if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); this.selectResourceToBeValidated(n1001_5000, 1001, 5000, this.entityManager, j); this.entityManager.flush(); this.userTx.commit(); int n5001_10000 = new Double( Math.ceil(jobValidationData.get("nd-res5001-10000").doubleValue() * percent)).intValue(); if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); this.selectResourceToBeValidated(n5001_10000, 5001, 10000, this.entityManager, j); this.entityManager.flush(); this.userTx.commit(); int n10000 = new Double(Math.ceil(jobValidationData.get("nd-res10000+").doubleValue() * percent)) .intValue(); if (!this.userTx.isActive()) { this.userTx.begin(); } this.entityManager.joinTransaction(); this.selectResourceToBeValidated(n10000, 10001, Integer.MAX_VALUE, this.entityManager, j); this.entityManager.flush(); this.userTx.commit(); if (!this.userTx.isActive()) { this.userTx.begin(); } j.setValidationStatus(Job.VALIDATION_IN_PROGRESS); this.entityManager.joinTransaction(); this.entityManager.merge(j); this.entityManager.flush(); this.userTx.commit(); if (!this.userTx.isActive()) { this.userTx.begin(); } } catch (Exception e) { try { if (this.userTx != null && this.userTx.isActive()) { this.userTx.rollback(); } } catch (IllegalStateException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (SecurityException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (SystemException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } e.printStackTrace(); } finally { } return "OK"; } private void setCleanedText(String cleanedText) { this.cleanedText = cleanedText; } public void setDetachedJobId(Integer jobId) { if (jobId != null) { this.job = this.entityManager.find(Job.class, jobId); Session sess = (Session) this.entityManager.getDelegate(); sess.evict(this.job); } else { this.job = null; } } public void setFilterAnalyzed(boolean filterAnalyzed) { this.filterAnalyzed = filterAnalyzed; } public void setFilterMimeTypeValue(String filterMimeTypeValue) { this.filterMimeTypeValue = filterMimeTypeValue; } public void setFilterNameValue(String filterNameValue) { this.filterNameValue = filterNameValue; } public void setFilterResStatusValue(Integer filterResStatusValue) { this.filterResStatusValue = filterResStatusValue; } public void setFilterStatusValue(Integer filterStatusValue) { this.filterStatusValue = filterStatusValue; } public void setFilterURLValue(String filterURLValue) { this.filterURLValue = filterURLValue; } public void setFilterUserValue(String filterUserValue) { this.filterUserValue = filterUserValue; } public void setFilterValidated(boolean filterValidated) { this.filterValidated = filterValidated; } public void setFirstResult(Integer firstResult) { this.firstResult = firstResult; } public void setFunctionalMetadatum(Integer functionalMetadatum) { this.functionalMetadatum = functionalMetadatum; } public void setFunctionalMetadatumValue(Integer functionalMetadatumValue) { this.functionalMetadatumValue = functionalMetadatumValue; } public void setJob(Job job) { this.job = job; } public void setJobId(Integer jobId) { if (jobId != null) { this.job = this.entityManager.find(Job.class, jobId); } else { this.job = null; } if (this.job.getCleaningScript() == null || this.job.getCleaningScript().trim().length() < 1) { try { this.job.setCleaningScript(FileUtils.readFileToString(new File(this.perlCleanerTemplate))); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } public void setLanguagesToBeDeleted(List<String> languagesToBeDeleted) { this.languagesToBeDeleted = languagesToBeDeleted; } public void setPageSize(int resPerPage) { this.pageSize = resPerPage; } public void setPollInterval(String pollInterval) { this.pollInterval = pollInterval; } public void setPosText(List<PoSLine> posText) { this.posText = posText; } public void setSemanticMetadatum(Integer semanticMetadatum) { this.semanticMetadatum = semanticMetadatum; } public void setSemanticMetadatumValue(Integer semanticMetadatumValue) { this.semanticMetadatumValue = semanticMetadatumValue; } public void setSortField(String sortField) { this.sortField = sortField; } public void setSortOrder(String sortOrder) { this.sortOrder = sortOrder; } public void setTotalResults(Long singleResult) { this.totalResults = singleResult; } public void setTotalSelectedResourcesNumber(Long singleResult) { this.totalSelectedResourcesNumber = singleResult; } public void setValidation(CrawledResource crawledResource, Integer validationStatus) { if (crawledResource != null && validationStatus >= CrawledResource.TO_BE_VALIDATED_RESOURCE && validationStatus <= CrawledResource.NOT_VALID_RESOURCE) { crawledResource.setValidationStatus(validationStatus); if (!validationStatus.equals(CrawledResource.TO_BE_VALIDATED_RESOURCE)) { crawledResource.setValidation(CrawledResource.VALIDATED); } else { crawledResource.setValidation(CrawledResource.CHOOSEN_FOR_VALIDATION); } this.setValidationToBeSaved(true); // this.entityManager.merge(crawledResource); } // if (this.getValidatedResourcesNumber() == 0) { // this.job.setValidationStatus(Job.TO_BE_VALIDATED); // } else if (this.getValidatedResourcesNumber() < this // .getTotalSelectedResourcesNumber()) { // this.job.setValidationStatus(Job.VALIDATION_IN_PROGRESS); // } else { // this.job.setValidationStatus(this.calculateValidationResult()); // } // this.entityManager.merge(this.job); } public void setValidationToBeSaved(boolean validationToBeSaved) { this.validationToBeSaved = validationToBeSaved; } public void setWordsLimitDeletionNumber(Integer wordsLimitDeletionNumber) { this.wordsLimitDeletionNumber = wordsLimitDeletionNumber; } public void startJob() throws HeritrixException { this.crawlerManager.startJob(this.job.getName(), this.getCurrentUser()); } public void stopJob() throws HeritrixException { this.crawlerManager.stopJob(this.job.getName(), this.getCurrentUser()); } public void testScript(CrawledResource cr) { JobCleaner jobCleaner = new JobCleaner(this.perlUser, this.perlPw, this.cleanerPath, cr.getJob().getCleaningScript(), null); jobCleaner.testScript(cr, cr.getJob().getCleaningScript()); this.testBefore = jobCleaner.getTestBefore(); this.testAfter = jobCleaner.getTestAfter(); this.testOutput = jobCleaner.getTestOutput(); } public void toggleDeleted(CrawledResource crawledResource) { if (crawledResource != null) { crawledResource.setDeleted(!crawledResource.isDeleted()); if (crawledResource.isDeleted() && crawledResource.isIndexed()) { crawledResource.setProcessed(Parameter.FINISHED); // this.asyncIndexer.deleteSingleCrawledResource(crawledResource // .getId()); } else if (!crawledResource.isDeleted() && !crawledResource.isIndexed()) { crawledResource.setProcessed(Parameter.INDEXING_PHASE); // this.asyncIndexer.indexSingleCrawledResource( // crawledResource.getId(), null, true); } this.entityManager.merge(crawledResource); } } }