de.isabeldrostfromm.sof.naive.Vectoriser.java Source code

Java tutorial

Introduction

Here is the source code for de.isabeldrostfromm.sof.naive.Vectoriser.java

Source

/**
 * Copyright (C) 2013 Isabel Drost-Fromm
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.isabeldrostfromm.sof.naive;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;

import de.isabeldrostfromm.sof.util.Vectors;

/**
 * Turns {@link Document} beans into feature vectors by hashing the analysed
 * body and title text with Mahout's {@link LuceneTextValueEncoder}.
 *
 * <p>The resulting vector is the concatenation of one fixed-size segment per
 * encoded field; {@link #getCardinality()} reports the total size.
 */
public class Vectoriser {
    /** Cardinality of the vector portion to use for encoding posting bodies. */
    public static final int bodyCard = 1000000;
    /** Cardinality of the vector portion to use for encoding posting titles. */
    public static final int titleCard = 1000000;
    /** Cardinality of the vector portion to use for encoding posting tags (0: tags are not encoded). */
    private static final int tagCard = 0;
    /** Number of single double values to encode (0: no plain numeric features are encoded). */
    private static final int doubles = 0;
    /**
     * Number of hashed vector locations each term is written to. This is
     * deliberately small and independent of the segment cardinality: the
     * encoder writes every term into this many slots, so using the
     * cardinality here would make each term touch (nearly) the whole segment.
     */
    private static final int termProbes = 2;

    /**
     * @return the total cardinality of vectors produced by {@link #vectorise(Document)},
     *         i.e. the sum of the cardinalities of all encoded segments.
     */
    public static int getCardinality() {
        return bodyCard + titleCard + tagCard + doubles;
    }

    /**
     * Turn a document bean into a vector.
     *
     * <p>Body and title are encoded into separate segments which are then
     * appended (body first, title second). Tags and reputation are not
     * encoded at the moment, matching {@code tagCard} and {@code doubles}
     * being zero.
     *
     * @param document the document to turn into a vector.
     * @return the resulting vector.
     * */
    public Vector vectorise(Document document) {
        Vector body = luceneEncode(bodyCard, document.getBody());
        Vector title = luceneEncode(titleCard, document.getTitle());

        return Vectors.append(body, title);
    }

    /**
     * Encodes the given text into a fresh sparse vector using a
     * LuceneTextValueEncoder backed by a StandardAnalyzer.
     *
     * @param cardinality size of the vector segment to create.
     * @param text the text to analyse and encode; {@code null} or empty text
     *        yields an empty vector of the requested cardinality.
     * @return a vector of the given cardinality holding the hashed term weights.
     * */
    private static Vector luceneEncode(int cardinality, String text) {
        Vector vector = new SequentialAccessSparseVector(cardinality);
        if (text == null || text.isEmpty()) {
            // Nothing to tokenise: keep the segment empty instead of failing inside the encoder.
            return vector;
        }

        LuceneTextValueEncoder encoder = new LuceneTextValueEncoder("sof");
        encoder.setAnalyzer(new StandardAnalyzer(Version.LUCENE_36));
        // Probes = slots per term, NOT the vector size; the size comes from the vector itself.
        encoder.setProbes(termProbes);
        encoder.addText(text);
        encoder.flush(1, vector);
        return vector;
    }

}