de.tudarmstadt.ukp.csniper.resbuild.EvaluationItemFixer2.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.csniper.resbuild.EvaluationItemFixer2.java

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.csniper.resbuild;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;

import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.csniper.resbuild.stuff.DummySentenceSplitter;
import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.Corpus;
import de.tudarmstadt.ukp.csniper.webapp.search.CorpusService;
import de.tudarmstadt.ukp.csniper.webapp.search.SearchEngine;
import de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit;
import de.tudarmstadt.ukp.csniper.webapp.search.cqp.CqpEngine;
import de.tudarmstadt.ukp.csniper.webapp.search.cqp.CqpQuery;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter;

/**
 * Stand-alone maintenance tool that re-parses cached items whose stored Penn tree is either the
 * literal {@code ERROR} or the empty string.
 * <p>
 * For every such row the covered text is fetched from the corpus through CQP (looked up by
 * collection, document and begin/end offsets), sent through a small UIMA pipeline
 * ({@link DummySentenceSplitter}, {@link StanfordSegmenter} creating tokens only and
 * {@link StanfordParser} with the English {@code factored} variant), and the first resulting
 * {@link PennTree}, with whitespace normalized, is written back to the database.
 * <p>
 * Items that cannot be retrieved or parsed are logged and skipped so that a single bad item does
 * not abort the whole run.
 */
public class EvaluationItemFixer2 {
    private static final Log log = LogFactory.getLog(EvaluationItemFixer2.class);

    // NOTE(review): connection settings and credentials are hard-coded for a local setup. An
    // alternative host used previously by the author was "jdbc:mysql://127.0.0.1/".
    private static final String HOST = "jdbc:mysql://localhost/";
    private static final String DATABASE = "csniper";
    private static final String USER = "root";
    private static final String PASSWORD = "gugaguga";

    // Windows locations of the CQP binary and the corpus repository. Alternative Unix-style
    // locations used previously by the author were "/opt/imscwb/cqp" and "/srv/csniper".
    private static final String CQP_EXECUTABLE = "D:\\ukp\\cwb-3.4.3\\bin\\cqp.exe";
    private static final String REPOSITORY = "D:\\ukp\\data\\csniper";

    // The following four constants are not referenced by the active code of this class.
    private static final String LOG_SUCCESSFUL = "C:\\users\\dodinh\\desktop\\successful.txt";
    private static final String LOG_FAILED = "C:\\users\\dodinh\\desktop\\failed.txt";

    private static final String LRB = "-LRB-";
    private static final String RRB = "-RRB-";

    // NOTE(review): the table name is spelled "cachedparse" here but "CachedParse" in the update
    // below; on a MySQL server with case-sensitive table names only one spelling can match.
    private static final String SELECT_QUERY = "SELECT * FROM cachedparse WHERE pennTree = 'ERROR' OR pennTree = ''";
    private static final String UPDATE_QUERY = "UPDATE CachedParse SET pennTree = ? WHERE collectionId = ? AND documentId = ? AND beginOffset = ? AND endOffset = ?";

    /** Database connection shared by all steps; opened by {@link #connect} and closed by {@link #main}. */
    private static Connection connection;

    /**
     * CQP engine backed by a minimal {@link CorpusService} stub: only the repository path is
     * provided, every other lookup returns {@code null}.
     */
    private static CqpEngine engine = new CqpEngine() {
        private static final long serialVersionUID = 1L;
        {
            setCorpusService(new CorpusService() {
                @Override
                public List<SearchEngine> listEngines(String aCorpusId) {
                    return null;
                }

                @Override
                public List<String> listCorpora() {
                    return null;
                }

                @Override
                public File getRepositoryPath() {
                    return new File(REPOSITORY);
                }

                @Override
                public Corpus getCorpus(String aCorpusId) {
                    return null;
                }
            });
            setCqpExecutable(new File(CQP_EXECUTABLE));
        }
    };

    /**
     * Runs the fixer: selects all broken cached parses, re-parses their covered text and stores the
     * new Penn tree.
     *
     * @param args
     *            ignored.
     */
    public static void main(String[] args) {
        connect(HOST, DATABASE, USER, PASSWORD);

        int updated = 0;
        int failed = 0;

        PreparedStatement select = null;
        PreparedStatement update = null;
        ResultSet rs = null;
        try {
            select = connection.prepareStatement(SELECT_QUERY);
            log.info("Running query [" + SELECT_QUERY + "].");
            rs = select.executeQuery();

            update = connection.prepareStatement(UPDATE_QUERY);

            // A single JCas is reused for all items and reset after each one.
            JCas jcas = JCasFactory.createJCas();

            AnalysisEngine sentences = createEngine(DummySentenceSplitter.class);
            AnalysisEngine tokenizer = createEngine(StanfordSegmenter.class,
                    StanfordSegmenter.PARAM_CREATE_SENTENCES, false, StanfordSegmenter.PARAM_CREATE_TOKENS, true);
            AnalysisEngine parser = createEngine(StanfordParser.class, StanfordParser.PARAM_WRITE_CONSTITUENT, true,
                    StanfordParser.PARAM_WRITE_PENN_TREE, true, StanfordParser.PARAM_LANGUAGE, "en",
                    StanfordParser.PARAM_VARIANT, "factored");

            while (rs.next()) {
                String collectionId = rs.getString("collectionId");
                String documentId = rs.getString("documentId");
                int beginOffset = rs.getInt("beginOffset");
                int endOffset = rs.getInt("endOffset");
                String item = "[" + collectionId + "/" + documentId + " " + beginOffset + "-" + endOffset + "]";

                String text;
                try {
                    text = retrieveCoveredText(collectionId, documentId, beginOffset, endOffset);
                } catch (IllegalArgumentException e) {
                    // Thrown when the corpus query yields no match; skip this item only.
                    log.warn("Skipping " + item + ": no covered text could be retrieved.", e);
                    failed++;
                    continue;
                }

                try {
                    jcas.setDocumentText(text);
                    jcas.setDocumentLanguage("en");
                    sentences.process(jcas);
                    tokenizer.process(jcas);
                    parser.process(jcas);

                    System.out.println("Updating " + text);

                    // Only the first Penn tree found in the CAS is stored.
                    boolean treeStored = false;
                    for (PennTree p : JCasUtil.select(jcas, PennTree.class)) {
                        String tree = StringUtils.normalizeSpace(p.getPennTree());
                        update.setString(1, tree);
                        update.setString(2, collectionId);
                        update.setString(3, documentId);
                        update.setInt(4, beginOffset);
                        update.setInt(5, endOffset);
                        update.executeUpdate();
                        System.out.println("with tree " + tree);
                        treeStored = true;
                        break;
                    }

                    if (treeStored) {
                        updated++;
                    } else {
                        log.warn("Skipping " + item + ": the parser produced no Penn tree.");
                        failed++;
                    }
                } catch (UIMAException e) {
                    log.error("Skipping " + item + ": the UIMA pipeline failed.", e);
                    failed++;
                } finally {
                    // Always clear the CAS so the next item starts from a clean state.
                    jcas.reset();
                }
            }
        } catch (SQLException e) {
            log.error("Exception while reading or updating cached parses: " + e.getMessage(), e);
        } catch (UIMAException e) {
            log.error("Exception while setting up the UIMA pipeline: " + e.getMessage(), e);
        } finally {
            closeQuietly(rs);
            closeQuietly(select);
            closeQuietly(update);
            // Closed here so the connection is released even if an unchecked exception escapes.
            closeQuietly(connection);
        }

        log.info("Stored new Penn trees for [" + updated + "] items; [" + failed + "] items could not be fixed.");
    }

    /**
     * Opens the shared database connection using the MySQL JDBC driver.
     *
     * @param aHost
     *            JDBC URL prefix up to and including the trailing slash.
     * @param aDatabase
     *            database name appended to the host prefix.
     * @param aUser
     *            database user.
     * @param aPassword
     *            database password.
     * @throws RuntimeException
     *             if the driver class cannot be loaded or the connection cannot be established.
     */
    private static void connect(String aHost, String aDatabase, String aUser, String aPassword) {
        String url = aHost + aDatabase;

        try {
            Class.forName("com.mysql.jdbc.Driver");
            // Credentials are passed separately instead of being concatenated into the URL, so
            // they need no URL escaping and are not part of the URL string.
            connection = DriverManager.getConnection(url, aUser, aPassword);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Failed to load the specified database driver.", e);
        } catch (SQLException e) {
            throw new RuntimeException("There was an unrecoverable error while connecting to the database.", e);
        }
    }

    /**
     * Closes a JDBC {@link ResultSet}, {@link Statement} or {@link Connection}, logging instead of
     * propagating any failure. {@code null} and objects of other types are ignored.
     * <p>
     * Using a workaround for java < 1.7.
     *
     * @param aAutoCloseable
     *            the resource to close; may be {@code null}.
     */
    private static void closeQuietly(Object aAutoCloseable) {
        if (aAutoCloseable == null) {
            return;
        }

        try {
            if (aAutoCloseable instanceof ResultSet) {
                ((ResultSet) aAutoCloseable).close();
            } else if (aAutoCloseable instanceof Statement) {
                ((Statement) aAutoCloseable).close();
            } else if (aAutoCloseable instanceof Connection) {
                ((Connection) aAutoCloseable).close();
            }
        } catch (Exception e) {
            log.error("There was an unrecoverable error while closing [" + aAutoCloseable + "].", e);
        }
    }

    /**
     * Looks up the text between two offsets of a document by running a CQP query against the given
     * collection.
     *
     * @param aCollectionId
     *            collection the query is run against.
     * @param aDocumentId
     *            value matched against {@code match.text_id}.
     * @param aBeginOffset
     *            value matched against the {@code begin} attribute of the first token.
     * @param aEndOffset
     *            value matched against the {@code end} attribute of the last token.
     * @return the covered text of the first result.
     * @throws IllegalArgumentException
     *             if the query yields no match; the message is the query string.
     */
    private static String retrieveCoveredText(final String aCollectionId, String aDocumentId, int aBeginOffset,
            int aEndOffset) {
        String queryString = "[begin=\"" + aBeginOffset + "\"] []* [end=\"" + aEndOffset + "\"] :: match.text_id=\""
                + aDocumentId + "\"";
        log.trace(queryString);

        CqpQuery query = new CqpQuery(engine, "", aCollectionId);
        try {
            query.setContext(0, 0, ContextUnit.CHARACTER);
            query.runQuery(queryString);

            if (query.size() > 1) {
                log.warn("More than 1 entry found: " + query);
            }
            if (query.size() == 0) {
                log.error("Nothing found for: " + queryString);
                throw new IllegalArgumentException(queryString);
            }
            return query.cat(1).get(0).getCoveredText();
        } finally {
            // Release the query on every path, including failures inside runQuery() or cat().
            query.close();
        }
    }
}