it.drwolf.ridire.session.async.IndexMerger.java Source code

Java tutorial

Introduction

Here is the source code for it.drwolf.ridire.session.async.IndexMerger.java

Source

/*******************************************************************************
 * Copyright 2013 Università degli Studi di Firenze
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.drwolf.ridire.session.async;

import it.drwolf.ridire.entity.CrawledResource;
import it.drwolf.ridire.entity.Job;
import it.drwolf.ridire.entity.Parameter;
import it.drwolf.ridire.index.ContextsIndexManager;

import java.io.File;
import java.io.IOException;
import java.util.List;

import javax.persistence.EntityManager;
import javax.transaction.SystemException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.jboss.seam.Component;
import org.jboss.seam.annotations.In;
import org.jboss.seam.annotations.Name;
import org.jboss.seam.annotations.async.Asynchronous;
import org.jboss.seam.transaction.UserTransaction;

import com.jcraft.jsch.Channel;
import com.jcraft.jsch.ChannelSftp;
import com.jcraft.jsch.JSch;
import com.jcraft.jsch.JSchException;
import com.jcraft.jsch.Session;
import com.jcraft.jsch.SftpException;

/**
 * Seam component that copies the ".txt.pos" files of indexed crawled resources
 * from a remote host (via SFTP) into a local store and hands each transferred
 * resource to the {@link ContextsIndexManager} for indexing.
 *
 * <p>
 * Connection parameters (user, password, host) and the local store directory
 * are read from {@link Parameter} entities. Work is done in small batches,
 * each inside its own user transaction.
 */
@Name("indexMerger")
public class IndexMerger {

    /** Number of crawled resources loaded per query and per transaction. */
    private static final int RESBATCH = 20;
    /** Number of transfers after which the SFTP connection is closed and reopened. */
    private static final int TRANSFERS_PER_CONNECTION = 30;
    /** Suffix of the per-resource files that are transferred. */
    private static final String POS_SUFFIX = ".txt.pos";

    @In(create = true)
    private FlagBearer flagBearer;
    private UserTransaction userTx;
    private EntityManager entityManager1;
    // Looked up through Component.getInstance() in mergeIndexes(); the field must
    // be declared because it is assigned and used there.
    private ContextsIndexManager contextsIndexManager;

    @In
    private EntityManager entityManager;
    private ChannelSftp c;
    private JSch jSch;
    private Session session;

    /**
     * Closes the SFTP channel and the SSH session, if any. Safe to call when no
     * connection was ever opened or when it has already been closed.
     */
    private void disconnect() {
        if (this.c != null) {
            this.c.disconnect();
            this.c = null;
        }
        if (this.session != null) {
            this.session.disconnect();
            this.session = null;
        }
    }

    /**
     * Tells whether the ".txt.pos" file of a resource is already present (and
     * readable) in the local store.
     *
     * @param www1localstore root directory of the local store
     * @param jobName        name of the job, used as sub-directory name
     * @param cr             the crawled resource; its digest is the file base name
     * @return {@code true} if the local file exists and can be read
     */
    private boolean isAlreadyIndexed(String www1localstore, String jobName, CrawledResource cr) {
        File crFile = new File(
                www1localstore + File.separator + jobName + File.separator + cr.getDigest() + IndexMerger.POS_SUFFIX);
        return crFile.exists() && crFile.canRead();
    }

    /**
     * Tells whether the local directory of a job already contains at least as
     * many entries as the job has indexed resources.
     *
     * @param j              the job to check
     * @param crSize         number of indexed crawled resources of the job
     * @param www1localstore root directory of the local store
     * @return {@code true} if the job directory exists and holds at least
     *         {@code crSize} entries
     */
    private boolean isAlreadyMerged(Job j, long crSize, String www1localstore) {
        File jobDir = new File(www1localstore + File.separator + j.getName());
        // listFiles() returns null when the path does not exist, is not a
        // directory, or cannot be read: all of these mean "not merged yet".
        File[] localFiles = jobDir.listFiles();
        return localFiles != null && localFiles.length >= crSize;
    }

    /**
     * Transfers and indexes the resources of all validated, indexed jobs.
     *
     * <p>
     * Jobs whose local directory is already complete are skipped, as are single
     * resources whose file is already present locally. Processing stops early
     * when the flag bearer reports that merging was stopped, or once the number
     * of transferred words exceeds the corpus size limit. Exceptions are printed
     * and not propagated; an active transaction is rolled back and the SFTP
     * connection is always closed before returning.
     */
    @Asynchronous
    public void mergeIndexes() {
        final int CORPUS_SIZE_LIMIT = 100000000;
        this.flagBearer.setMergingStopped(false);
        String username = this.entityManager.find(Parameter.class, Parameter.WWW1_USERNAME.getKey()).getValue();
        String password = this.entityManager.find(Parameter.class, Parameter.WWW1_PW.getKey()).getValue();
        String host = this.entityManager.find(Parameter.class, Parameter.WWW1_HOST.getKey()).getValue();
        String www1localstore = this.entityManager.find(Parameter.class, Parameter.WWW1_LOCAL_STORE.getKey())
                .getValue();
        this.userTx = (UserTransaction) Component.getInstance("org.jboss.seam.transaction.transaction");
        int transferredWords = 0;
        try {
            // 10 * 10 * 60 = 6000 seconds, i.e. 100 minutes.
            // NOTE(review): an earlier comment here said "10 mins" - confirm the intended value.
            this.userTx.setTransactionTimeout(10 * 10 * 60);
            this.userTx.begin();
            this.entityManager1 = (EntityManager) Component.getInstance("entityManager1");
            this.contextsIndexManager = (ContextsIndexManager) Component.getInstance("contextsIndexManager");
            this.entityManager1.joinTransaction();
            // The query selects j.id only, so the untyped result list holds job ids.
            @SuppressWarnings("unchecked")
            List<Integer> jobs = this.entityManager1
                    .createQuery("select j.id from Job j where j.validationStatus=:procstatus and indexed=true")
                    .setParameter("procstatus", Job.VALIDATED_OK).getResultList();
            this.entityManager1.flush();
            this.entityManager1.clear();
            this.userTx.commit();
            int transferredFiles = 0;
            for (int jobCount = 0; jobCount < jobs.size(); jobCount++) {
                if (!this.userTx.isActive()) {
                    this.userTx.begin();
                }
                int jobResourcesCount = 0;
                this.entityManager1.joinTransaction();
                Job j = this.entityManager1.find(Job.class, jobs.get(jobCount));
                String jobName = j.getName();
                long crawledResourcesSize = (Long) this.entityManager1.createQuery(
                        "select count(cr.id) from CrawledResource cr where cr.job.id=:jobid and cr.processed=:indexed")
                        .setParameter("jobid", j.getId()).setParameter("indexed", Parameter.INDEXED)
                        .getSingleResult();
                System.out.println("Merging job: " + jobName + " " + crawledResourcesSize + "(" + jobCount + "/"
                        + jobs.size() + ")");
                boolean alreadyMerged = this.isAlreadyMerged(j, crawledResourcesSize, www1localstore);
                if (alreadyMerged) {
                    System.out.println("Merging job " + jobName + ". Already merged.");
                }
                // End the per-job bookkeeping transaction in either case.
                this.entityManager1.flush();
                this.entityManager1.clear();
                this.userTx.commit();
                if (alreadyMerged) {
                    continue;
                }
                for (int resCount = 0; resCount < crawledResourcesSize; resCount += IndexMerger.RESBATCH) {
                    if (!this.userTx.isActive()) {
                        this.userTx.begin();
                    }
                    this.entityManager1.joinTransaction();
                    @SuppressWarnings("unchecked")
                    List<CrawledResource> crawledResources = this.entityManager1.createQuery(
                            "from CrawledResource cr where cr.job.id=:jobid and cr.processed=:indexed order by cr.id")
                            .setParameter("jobid", j.getId()).setParameter("indexed", Parameter.INDEXED)
                            .setFirstResult(resCount).setMaxResults(IndexMerger.RESBATCH).getResultList();
                    for (CrawledResource cr : crawledResources) {
                        jobResourcesCount++;
                        if (jobResourcesCount % 100 == 0) {
                            System.out.println(
                                    "Merging status: " + jobResourcesCount / (crawledResourcesSize * 1.0f) * 100
                                            + "%" + " (" + jobResourcesCount + "/" + crawledResourcesSize + ")");
                        }
                        if (this.isAlreadyIndexed(www1localstore, jobName, cr)) {
                            continue;
                        }
                        // A fresh connection is opened at the start and after every
                        // TRANSFERS_PER_CONNECTION transfers (see the matching disconnect below).
                        if (transferredFiles % IndexMerger.TRANSFERS_PER_CONNECTION == 0) {
                            this.openConnection(username, password, host);
                        }
                        try {
                            this.transferFile(www1localstore, jobName, cr);
                            this.contextsIndexManager.indexExternalCrawledResource(jobName,
                                    cr.getSemanticMetadatum() == null ? ""
                                            : cr.getSemanticMetadatum().getDescription(),
                                    cr.getFunctionalMetadatum() == null ? ""
                                            : cr.getFunctionalMetadatum().getDescription(),
                                    j.getId(), cr.getDigest());
                        } catch (JSchException jSchException) {
                            jSchException.printStackTrace();
                        } catch (SftpException sftpException) {
                            System.err.println("No such file: " + cr.getDigest());
                        } catch (IOException ioException) {
                            ioException.printStackTrace();
                        }
                        // Counted even when the transfer failed, so that a failing
                        // resource does not stall the connection recycling.
                        transferredFiles++;
                        transferredWords += cr.getWordsNumber();
                        if (transferredFiles % IndexMerger.TRANSFERS_PER_CONNECTION == 0) {
                            this.disconnect();
                        }
                    }
                    this.entityManager1.flush();
                    this.entityManager1.clear();
                    this.userTx.commit();
                    if (this.flagBearer.isMergingStopped()) {
                        this.disconnect();
                        this.contextsIndexManager.closeIndexWriter();
                        return;
                    }
                    if (transferredWords > CORPUS_SIZE_LIMIT) {
                        break;
                    }
                }
                if (transferredWords > CORPUS_SIZE_LIMIT) {
                    break;
                }
            }
            // Every path above has already committed its transaction; only flush
            // and commit if one is unexpectedly still active.
            if (this.userTx.isActive()) {
                this.entityManager1.flush();
                this.entityManager1.clear();
                this.userTx.commit();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (this.userTx != null && this.userTx.isActive()) {
                    this.userTx.rollback();
                }
            } catch (IllegalStateException e) {
                e.printStackTrace();
            } catch (SecurityException e) {
                e.printStackTrace();
            } catch (SystemException e) {
                e.printStackTrace();
            } finally {
                // Never leave the SSH session / SFTP channel open, whatever the exit path.
                this.disconnect();
            }
        }
    }

    /**
     * Opens an SSH session to the given host with password authentication and
     * an SFTP channel on top of it; both are stored in this component's fields.
     *
     * @param username remote user name
     * @param password remote password
     * @param host     remote host
     * @throws JSchException if the session or the channel cannot be established
     */
    private void openConnection(String username, String password, String host) throws JSchException {
        if (this.jSch == null) {
            this.jSch = new JSch();
        }
        this.session = this.jSch.getSession(username, host);
        java.util.Properties config = new java.util.Properties();
        // NOTE(review): host key verification is switched off here, which leaves
        // the connection open to man-in-the-middle attacks - consider a known_hosts file.
        config.put("StrictHostKeyChecking", "no");
        this.session.setConfig(config);
        this.session.setPassword(password);
        this.session.connect();
        Channel channel = this.session.openChannel("sftp");
        channel.connect();
        this.c = (ChannelSftp) channel;
    }

    /**
     * Downloads the ".txt.pos" file of a crawled resource into the job's
     * directory of the local store, creating that directory if needed and
     * overwriting an existing local file.
     *
     * <p>
     * The remote file is looked up in the "resources" directory next to the
     * resource's arc file.
     *
     * @param www1localstore root directory of the local store
     * @param jobName        name of the job, used as local sub-directory name
     * @param cr             the crawled resource to transfer
     * @throws JSchException declared for callers; see {@link #openConnection}
     * @throws SftpException if the SFTP download fails
     * @throws IOException   if the local directory cannot be created
     */
    private void transferFile(String www1localstore, String jobName, CrawledResource cr)
            throws JSchException, SftpException, IOException {
        String fileName = cr.getDigest() + IndexMerger.POS_SUFFIX;
        String localDir = www1localstore + File.separator + jobName;
        FileUtils.forceMkdir(new File(localDir));
        // NOTE(review): the remote path is built with the local file separator -
        // confirm this is intended when local and remote platforms differ.
        String remoteFile = FilenameUtils.getFullPath(cr.getArcFile()) + File.separator + "resources"
                + File.separator + fileName;
        this.c.get(remoteFile, localDir + File.separator + fileName, null, ChannelSftp.OVERWRITE);
    }
}