edu.psu.citeseerx.disambiguation.CsxDisambiguation.java Source code

Java tutorial

Introduction

Here is the source code for edu.psu.citeseerx.disambiguation.CsxDisambiguation.java

Source

/*
 * Copyright 2007 Penn State University
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.psu.citeseerx.disambiguation;

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;

import javax.sql.DataSource;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.springframework.beans.factory.ListableBeanFactory;

import edu.psu.citeseerx.loaders.ContextReader;
import edu.psu.citeseerx.disambiguation.CsxAuthorBlock;
import edu.psu.citeseerx.disambiguation.CsxAuthorFilter;
import edu.psu.citeseerx.disambiguation.dbscan.DBScan;
import edu.psu.citeseerx.disambiguation.dao.CsxAuthor;

/**
 * CsxDisambiguation
 * 
 * @author Puck Treeratpituk
 * @version $Rev$ $Date$
 */
public class CsxDisambiguation {

    public static final double EPS = 0.1;
    public static final int MIN_PTS = 2;

    public static void initDirectories(String dirpath) throws Exception {
        for (char a = 'A'; a <= 'Z'; a++) {
            for (char b = 'A'; b <= 'Z'; b++) {
                File dir = new File(dirpath + "/" + a + b);
                if (!dir.exists()) {
                    dir.mkdirs();
                }
            }
        }
    }

    public static void createBlocks(ListableBeanFactory factory) throws Exception {
        String dirpath = "data/csauthors/blocks";

        DataSource dataSource = (DataSource) factory.getBean("csxDataSource");

        PreparedStatement st = dataSource.getConnection().prepareStatement("SELECT * FROM authors",
                ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
        st.setFetchSize(Integer.MIN_VALUE);
        ResultSet rs = st.executeQuery();

        initDirectories(dirpath);

        CsxAuthorFilter filter = (CsxAuthorFilter) factory.getBean("csxAuthorFilter");
        //new CsxAuthorFilter("data/csauthors/name_stopwords.txt");
        BufferedWriter skip = new BufferedWriter(new FileWriter("skip.txt"));

        int count = 0;
        Map<String, List<String>> blocks = new HashMap<String, List<String>>();
        while (rs.next()) {
            count++;
            if ((count % 10000) == 0)
                System.out.println("#Auth:" + count);
            String rsname = rs.getString("name");
            if (!filter.isStopword(rsname) && !filter.isInstitute(rsname) && !filter.isPosition(rsname)) {

                CsxAuthor auth = new CsxAuthor(rs);
                String lname = auth.getLastName();
                String fname = auth.getFirstName();

                if ((lname != null) && (fname != null)) {
                    if ((lname.charAt(0) >= 'A') && (lname.charAt(0) <= 'Z') && (fname.charAt(0) >= 'A')
                            && (fname.charAt(0) <= 'Z') && !((fname.length() == 1) && (lname.length() == 1))
                            && !(lname.matches(".*/.*"))) {

                        String l_init = lname.substring(0, 1).toUpperCase();
                        String f_init = fname.substring(0, 1).toUpperCase();
                        String key = l_init + f_init + "/" + lname.toLowerCase() + "_" + f_init.toLowerCase()
                                + ".txt";

                        List<String> list;
                        if (!blocks.containsKey(key)) {
                            list = new ArrayList<String>();
                            blocks.put(key, list);
                        } else {
                            list = blocks.get(key);
                        }
                        list.add(auth.getId());
                    } else {
                        skip.write("SKIP: [" + rsname + "]\n");
                    }
                }
            }
        }
        skip.close();

        for (String key : blocks.keySet()) {
            List<String> aids = blocks.get(key);
            // only care about cluster with more than one document
            if (aids.size() > 1) {
                BufferedWriter out = new BufferedWriter(new FileWriter(dirpath + "/" + key));
                for (String aid : aids) {
                    out.write(aid + "\n");
                }
                out.close();
            }
        }
    }

    public static void disambiguate(ListableBeanFactory factory, String infile, String outfile) throws Exception {

        CsxAuthorBlock block = (CsxAuthorBlock) factory.getBean("csxAuthorBlock");
        block.loadAuthors(infile);

        int min_pts = MIN_PTS;
        if (block.points.size() <= 3)
            min_pts = 1;
        DBScan dbscan = new DBScan(block, EPS, min_pts);
        dbscan.run();
        dbscan.printResults(outfile);
    }

    public static void disambiguateFile(ListableBeanFactory factory, String infile) throws Exception {
        //CsxAuthorBlock block = new CsxAuthorBlock(conn, rconn, "jian_huang2.block");
        //CsxAuthorBlock block = new CsxAuthorBlock(conn, rconn, "data/csauthors/blocks/MP/mitra_p.txt");
        //CsxAuthorBlock block = new CsxAuthorBlock(conn, rconn, "data/csauthors/blocks/BK/borner_k.txt");

        CsxAuthorBlock block = (CsxAuthorBlock) factory.getBean("csxAuthorBlock");
        block.loadAuthors(infile);

        int min_pts = MIN_PTS;
        if (block.points.size() <= 3)
            min_pts = 1;
        DBScan dbscan = new DBScan(block, EPS, min_pts);
        dbscan.run();
        dbscan.printResults();
    }

    public static void disambiguateDirectory(ListableBeanFactory factory, String indir, String outdir)
            throws Exception {

        File dir = new File(indir);
        String[] files = dir.list();
        Arrays.sort(files);
        for (String file : files) {
            String infile = indir + "/" + file;
            String outfile = outdir + "/" + file.replace(".txt", ".out");
            System.out.println("> " + file);
            disambiguate(factory, infile, outfile);
        }
    }

    private static void usage(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        System.out.println("");
        formatter.printHelp("CsxDisambiguation", options);
        System.out.println("");
    }

    private static Options setupOptions() {
        Options options = new Options();

        options.addOption("help", false, "print help message");

        Option cmd = OptionBuilder.withArgName("cmd").hasArg().withDescription("init_dirs, init_blocks, dbscan")
                .create("cmd");
        Option infile = OptionBuilder.withArgName("file").hasArg().withDescription("a single input file")
                .create("infile");
        Option indir = OptionBuilder.withArgName("indir").hasArg()
                .withDescription("directory containing the input files").create("indir");
        Option outdir = OptionBuilder.withArgName("outdir").hasArg()
                .withDescription("directory to put the output files").create("outdir");
        options.addOption(cmd);
        options.addOption(infile);
        options.addOption(indir);
        options.addOption(outdir);
        return options;
    }

    public static boolean validOptions(CommandLine line, Options options) {
        if (!line.hasOption("cmd")) {
            System.out.println("\nERROR: Please specify one command");
            return false;
        }
        String cmd = line.getOptionValue("cmd");
        if (cmd.equals("dbscan") && !(line.hasOption("infile") ^ line.hasOption("indir"))) {
            System.out.println("\nERROR: Please specify either -infile or -indir, but not both");
            return false;
        }
        return true;
    }

    public static void main(String[] args) {
        Date start = new Date();
        System.out.println("START:" + start);
        try {
            CommandLineParser cmdparser = new GnuParser();
            Options options = setupOptions();
            CommandLine line = cmdparser.parse(options, args);

            ListableBeanFactory factory = ContextReader.loadContext();
            if (!validOptions(line, options)) {
                usage(options);
                System.exit(0);
            }
            String cmd = line.getOptionValue("cmd");
            String infile = line.getOptionValue("infile");
            String indir = line.getOptionValue("indir");
            String outdir = line.getOptionValue("outdir");
            if (cmd.equals("init_dirs")) {
                // 1) init directories
                initDirectories("data/csauthors/blocks");
                initDirectories("data/csauthors/output");
            } else if (cmd.equals("init_blocks")) {
                // 2) create blocks
                createBlocks(factory);
            } else if (cmd.equals("dbscan")) {
                // 3) disambiguate (required 1. & 2.)
                //disambiguateDirectory(factory, args[0], args[1]);
                //disambiguateFile(factory, args[0]);
                if (infile != null) {
                    disambiguateFile(factory, infile);
                } else if (indir != null && outdir != null) {
                    disambiguateDirectory(factory, indir, outdir);
                } else {
                    usage(options);
                    System.exit(0);
                }
            }
            /*else if (cmd.equals("match_author")) {
               String input_file = "";
               disambiguateOneAuthor(input_file);
               }*/
        } catch (Exception ex) {
            ex.printStackTrace();
            Date now = new Date();
            System.out.println("CRASH:" + now);
        }
        Date end = new Date();
        System.out.println("END:" + end);
        System.out.println("TIME:" + (end.getTime() - start.getTime()));
    }
}