classifier.page.CreateWekaInput.java Source code

Java tutorial

Introduction

Here is the source code for classifier.page.CreateWekaInput.java

Source

/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file.  Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package classifier.page;

import classifier.util.vsm.VSMElement;
import classifier.util.vsm.VSMVector;
import classifier.util.vsm.VSMElementComparator;
import classifier.util.parser.PaginaURL;
import classifier.util.string.StopList;
import classifier.util.FileComparator;

import java.io.File;
import org.xml.sax.SAXException;
import java.net.MalformedURLException;
import java.net.URL;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.Collections;
import java.util.Random;

import weka.classifiers.functions.SMO;

/**
 * <p> </p>
 *
 * <p>Description: </p>
 *
 * <p>Copyright: Copyright (c) 2004</p>
 *
 * <p> </p>
 *
 * @author Luciano Barbosa
 * @version 1.0
 */
public class CreateWekaInput {

    protected VSMVector[][] trainingExamples = null;
    //   protected VSMVector[] negativeExamples = null;

    protected VSMVector[][] testExamples = null;
    //   protected VSMVector[] negativeTestExamples = null;

    protected int numOfFeatures = Integer.MAX_VALUE;

    protected int minDF = 5;

    protected HashMap df = new HashMap();

    protected boolean isForm = false;

    protected StopList stoplist;

    public CreateWekaInput(File dir, File dirTest, StopList stoplist) throws SAXException, IOException {
        this(dir, dirTest, stoplist, Integer.MAX_VALUE);
    }

    public CreateWekaInput(File input, File inputTest, StopList stoplist, int numOfElems)
            throws SAXException, IOException {
        trainingExamples = new VSMVector[2][];
        this.stoplist = stoplist;
        if ((new File(input + File.separator + "positive")).isDirectory()) {
            File[] positiveFiles = new File(input + File.separator + "positive").listFiles();
            System.out.println("POSITIVE:" + positiveFiles.length);
            File[] negativeFiles = new File(input + File.separator + "negative").listFiles();
            System.out.println("NEGATIVE:" + negativeFiles.length);
            int[] negIndexes = selectRandomNum(1, negativeFiles.length, numOfElems);
            trainingExamples[1] = createVSM(negativeFiles, stoplist, negIndexes, true);
            int[] posIndexes = selectRandomNum(1, positiveFiles.length, numOfElems);
            trainingExamples[0] = createVSM(positiveFiles, stoplist, posIndexes, true);
        } else {
            trainingExamples[0] = createVSM(new File(input + File.separator + "positive"), stoplist);
            trainingExamples[1] = createVSM(new File(input + File.separator + "negative"), stoplist);
        }
        if (inputTest != null) {
            testExamples = new VSMVector[2][];
            if ((new File(inputTest + File.separator + "positive")).isDirectory()) {
                File temp = new File(inputTest + File.separator + "positive");
                System.out.println(temp.toString());
                File[] positiveTestFiles = temp.listFiles();
                trainingExamples[0] = createVSM(positiveTestFiles, stoplist, false);
                File[] negativeTestFiles = new File(inputTest + File.separator + "negative").listFiles();
                trainingExamples[1] = createVSM(negativeTestFiles, stoplist, false);
            } else {
                trainingExamples[0] = createVSM(new File(inputTest + File.separator + "positive"), stoplist);
                trainingExamples[1] = createVSM(new File(inputTest + File.separator + "negative"), stoplist);
            }
        }
    }

    public CreateWekaInput(String[][] pages, StopList stoplist, int size) throws SAXException, IOException {
        trainingExamples = new VSMVector[size][];
        for (int i = 0; i < size; i++) {
            String[] levelPages = pages[i];
            trainingExamples[i] = createVSM(levelPages, stoplist);
        }
    }

    private int[] selectRandomNum(long seed, int range, int elems) {
        if (elems > range) {
            elems = range;
        }
        int count = 0;
        Random random = new Random(seed);
        int next = random.nextInt(range);
        HashSet nums = new HashSet();
        int[] result = new int[elems];
        while (count < elems) {
            Integer num = new Integer(next);
            if (!nums.contains(num)) {
                result[count] = next;
                nums.add(num);
                count++;
            }
            next = random.nextInt(range);
        }
        return result;
    }

    protected VSMVector[] createVSM(String[] pages, StopList stoplist) throws SAXException {
        Vector<VSMVector> tempVSM = new Vector<VSMVector>();
        for (int i = 0; i < pages.length; i++) {
            try {
                if (pages[i] == null) {
                    continue;
                }
                VSMVector vsm = new VSMVector(pages[i], stoplist);
                tempVSM.add(vsm);
                Iterator iterator1 = vsm.getElements();
                while (iterator1.hasNext()) {
                    VSMElement elem = (VSMElement) iterator1.next();
                    VSMElement value = (VSMElement) df.get(elem.getWord());
                    if (value == null) {
                        df.put(elem.getWord(), new VSMElement(elem.getWord(), 1));
                    } else {
                        df.put(elem.getWord(), new VSMElement(elem.getWord(), value.getWeight() + 1));
                    }
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        VSMVector[] examples = new VSMVector[tempVSM.size()];
        tempVSM.toArray(examples);
        return examples;
    }

    protected VSMVector[] createVSM(File file, StopList stoplist) throws SAXException {
        Vector<VSMVector> tempVSM = new Vector<VSMVector>();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                VSMVector vsm = new VSMVector(line, stoplist);
                tempVSM.add(vsm);
                Iterator iterator1 = vsm.getElements();
                while (iterator1.hasNext()) {
                    VSMElement elem = (VSMElement) iterator1.next();
                    VSMElement value = (VSMElement) df.get(elem.getWord());
                    if (value == null) {
                        df.put(elem.getWord(), new VSMElement(elem.getWord(), 1));
                    } else {
                        df.put(elem.getWord(), new VSMElement(elem.getWord(), value.getWeight() + 1));
                    }
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        VSMVector[] examples = new VSMVector[tempVSM.size()];
        tempVSM.toArray(examples);
        return examples;
    }

    protected VSMVector[] createVSM(File[] files, StopList stoplist, int[] indexes, boolean addToFeatures)
            throws SAXException {
        Vector<VSMVector> tempVSM = new Vector<VSMVector>();
        for (int i = 0; i < files.length && i < indexes.length; i++) {
            try {
                VSMVector vsm = new VSMVector(files[indexes[i]].toString(), isForm, stoplist);
                tempVSM.add(vsm);
                if (addToFeatures) {
                    Iterator iterator1 = vsm.getElements();
                    while (iterator1.hasNext()) {
                        VSMElement elem = (VSMElement) iterator1.next();
                        VSMElement value = (VSMElement) df.get(elem.getWord());
                        if (value == null) {
                            df.put(elem.getWord(), new VSMElement(elem.getWord(), 1));
                        } else {
                            df.put(elem.getWord(), new VSMElement(elem.getWord(), value.getWeight() + 1));
                        }
                    }
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        VSMVector[] examples = new VSMVector[tempVSM.size()];
        tempVSM.toArray(examples);
        return examples;
    }

    protected VSMVector[] createVSM(File[] files, StopList stoplist, boolean addToFeatures)
            throws IOException, SAXException {
        int[] indexes = new int[files.length];
        for (int i = 0; i < indexes.length; i++) {
            indexes[i] = i;
        }
        return createVSM(files, stoplist, indexes, addToFeatures);
    }

    Vector<String> attributes = new Vector<String>();

    public String[] centroid2Weka(String output) throws FileNotFoundException, IOException {
        OutputStream fout = new FileOutputStream(output, false);
        OutputStream bout = new BufferedOutputStream(fout);
        OutputStreamWriter outputFile = new OutputStreamWriter(bout);
        StringBuffer header = new StringBuffer();
        header.append("@RELATION TSFC");
        header.append("\n");
        header.append("\n");
        StringBuffer tail = new StringBuffer();

        Vector bestWordsForm = new Vector(df.values());
        Collections.sort(bestWordsForm, new VSMElementComparator());
        for (int i = 0; i <= numOfFeatures && i < bestWordsForm.size(); i++) {
            VSMElement elem = (VSMElement) bestWordsForm.elementAt(i);
            if (elem.getWeight() > minDF) {
                header.append("@ATTRIBUTE ");
                if (elem.getWord().equals("class")) {
                    //This is a hack, weka does not allow attribute with name class.
                    elem.setWord("class-random-string");
                }
                header.append(elem.getWord());
                attributes.add(elem.getWord());
                header.append(" REAL");
                header.append("\n");
            }
        }
        header.append("@ATTRIBUTE class {");
        for (int i = 0; i < trainingExamples.length - 1; i++) {
            header.append("CLASS_" + i + ",");
        }
        header.append("CLASS_" + (trainingExamples.length - 1) + "}");

        tail.append("\n");
        tail.append("\n");
        tail.append("@DATA");
        tail.append("\n");
        for (int l = 0; l < trainingExamples.length; l++) {
            for (int i = 0; i < trainingExamples[l].length; i++) {
                VSMVector formTemp = trainingExamples[l][i];
                tail.append("{");
                for (int j = 0; j < attributes.size(); j++) {
                    VSMElement elemForm = formTemp.getElement(attributes.elementAt(j));
                    if (elemForm != null) {
                        tail.append(j);
                        tail.append(" ");
                        tail.append((int) elemForm.getWeight());
                        tail.append(",");
                    }
                }
                tail.append(attributes.size() + " CLASS_" + l + "}");
                tail.append("\n");
            }
        }
        outputFile.write(header.toString());
        outputFile.flush();
        outputFile.write(tail.toString());
        outputFile.close();
        if (testExamples != null) {
            createTestFile(output, bestWordsForm, header);
        }
        String[] atts = new String[attributes.size()];
        attributes.toArray(atts);
        return atts;
    }

    private void createTestFile(String output, Vector bestWordsForm, StringBuffer header)
            throws FileNotFoundException, IOException {
        OutputStream fout = new FileOutputStream(output + "_test", false);
        OutputStream bout = new BufferedOutputStream(fout);
        OutputStreamWriter outputFile = new OutputStreamWriter(bout);
        StringBuffer tail = new StringBuffer();
        tail.append("\n");
        tail.append("\n");
        tail.append("@DATA");
        tail.append("\n");
        for (int l = 0; l < testExamples.length; l++) {
            for (int i = 0; i < testExamples[l].length; i++) {
                VSMVector examples = testExamples[l][i];
                tail.append("{");
                for (int j = 0; j < attributes.size(); j++) {
                    VSMElement elemForm = examples.getElement(attributes.elementAt(j));
                    if (elemForm != null) {
                        tail.append(j);
                        tail.append(" ");
                        tail.append((int) elemForm.getWeight());
                        tail.append(",");
                    }
                }
                tail.append(attributes.size() + " CLASS_" + l + "}");
                tail.append("\n");
            }
        }
        outputFile.write(header.toString());
        outputFile.flush();
        outputFile.write(tail.toString());
        outputFile.close();
    }

    public static void main(String[] args) {
        StopList st = null;
        try {
            String cfgDir = args[0];
            String stoplistFile = cfgDir + "/stoplist.txt";
            st = new classifier.util.string.StopListArquivo(stoplistFile);
            String trainingPath = args[1];
            File dir = new File(trainingPath);
            File dirTest = null;
            CreateWekaInput createwekainput = new CreateWekaInput(dir, dirTest, st);
            String outputPath = args[2];
            createwekainput.centroid2Weka(trainingPath + "/weka.arff");

            SMO.main(new String[] { "-M", "-d", outputPath + "/pageclassifier.model", "-t",
                    trainingPath + "/weka.arff" });
        } catch (MalformedURLException ex1) {
            ex1.printStackTrace();
        } catch (IOException ex1) {
            ex1.printStackTrace();
        } catch (SAXException ex1) {
            ex1.printStackTrace();
        }
    }

}