edu.utsa.sifter.SearchHit.java Source code

Introduction

Here is the source code for edu.utsa.sifter.SearchHit.java
Source

/**
 *
 * Sifter - Search Indexes for Text Evidence Relevantly
 *
 * Copyright (C) 2013, University of Texas at San Antonio (UTSA)
 *
 * Sifter is a digital forensics and e-discovery tool for conducting
 * text based string searches.  It clusters and ranks search hits
 * to improve investigative efficiency. Hit-level ranking uses a 
 * patent-pending ranking algorithm invented by Dr. Nicole Beebe at UTSA.
 *  
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @author Jon Stewart, Lightbox Technologies
**/

package edu.utsa.sifter;

import org.codehaus.jackson.annotate.JsonProperty;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;

import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.util.BytesRef;

import org.apache.commons.lang3.StringEscapeUtils;

/**
 * A single highlighted passage (search hit) within a document, together
 * with the ranking score computed for it.
 *
 * The public fields and the {@code @JsonProperty} accessors form the JSON
 * view of the hit; the accessors simply forward to the backing
 * {@link Result} supplied at construction time.
 *
 * Natural ordering is by descending {@link #Score}.
 */
public class SearchHit implements Comparable<SearchHit> {
    /** Markup emitted immediately before each highlighted match. */
    private static final String HIGHLIGHT_OPEN = "<span class=\"secondarycolorbg\">";

    /** Markup emitted immediately after each highlighted match. */
    private static final String HIGHLIGHT_CLOSE = "</span>";

    /** Document-level data this hit belongs to. */
    private final Result DocData;

    /** Hit score; written by {@link #calculateScore} and rescaled by {@link #normalize}. */
    @JsonProperty
    public float Score;

    /** HTML-escaped passage text with every match wrapped in a highlight span. */
    @JsonProperty
    public String Passage;

    /** Start offset of the passage within the body text (from the Lucene passage). */
    @JsonProperty
    public int Start;

    /** End offset of the passage within the body text (from the Lucene passage). */
    @JsonProperty
    public int End;

    /** @return the {@code ID} of the backing document */
    @JsonProperty
    public String ID() {
        return DocData.ID;
    }

    /** @return the {@code Name} of the backing document */
    @JsonProperty
    public String Name() {
        return DocData.Name;
    }

    /** @return the {@code Path} of the backing document */
    @JsonProperty
    public String Path() {
        return DocData.Path;
    }

    /** @return the {@code Extension} of the backing document */
    @JsonProperty
    public String Extension() {
        return DocData.Extension;
    }

    /** @return the {@code Size} of the backing document */
    @JsonProperty
    public long Size() {
        return DocData.Size;
    }

    /** @return the {@code Modified} value of the backing document */
    @JsonProperty
    public long Modified() {
        return DocData.Modified;
    }

    /** @return the {@code Accessed} value of the backing document */
    @JsonProperty
    public long Accessed() {
        return DocData.Accessed;
    }

    /** @return the {@code Created} value of the backing document */
    @JsonProperty
    public long Created() {
        return DocData.Created;
    }

    /** @return the {@code Body} of the backing document */
    @JsonProperty
    public String Body() {
        return DocData.Body;
    }

    /** @return the {@code Cell} of the backing document */
    @JsonProperty
    public String Cell() {
        return DocData.Cell;
    }

    /** @return the {@code CellDistance} of the backing document */
    @JsonProperty
    public double CellDistance() {
        return DocData.CellDistance;
    }

    /** Largest {@code BytesRef.length} among the terms matched in this passage. */
    private int MaxTermLen = 0;

    /**
     * Builds a hit from a highlighter passage, rendering the passage text
     * as HTML with each match wrapped in a highlight span.
     *
     * @param doc  document-level data the hit belongs to
     * @param p    passage supplying the start/end offsets and the matches
     * @param body text the passage and match offsets index into
     */
    public SearchHit(final Result doc, final Passage p, final String body) {
        DocData = doc;
        Start = p.getStartOffset();
        End = p.getEndOffset();

        final int n = p.getNumMatches();
        if (n > 0) {
            final StringBuilder sb = new StringBuilder();
            final int[] matchStarts = p.getMatchStarts();
            final int[] matchEnds = p.getMatchEnds();
            final BytesRef[] terms = p.getMatchTerms();
            int curPos = Math.min(Start, matchStarts[0]);
            for (int i = 0; i < n; ++i) {
                final int matchStart = matchStarts[i];
                final int matchEnd = matchEnds[i];
                // Matches may overlap; only emit text that lies beyond what
                // has already been written so substring() never sees
                // begin > end and no text is duplicated.
                if (matchStart > curPos) {
                    sb.append(escapeHtml(body.substring(curPos, matchStart)));
                }
                if (matchEnd > curPos) {
                    sb.append(HIGHLIGHT_OPEN);
                    sb.append(escapeHtml(body.substring(Math.max(curPos, matchStart), matchEnd)));
                    sb.append(HIGHLIGHT_CLOSE);
                    curPos = matchEnd;
                }
                MaxTermLen = Math.max(MaxTermLen, terms[i].length);
            }
            // A match may extend past the passage end; clamp so the tail is
            // empty in that case instead of throwing.
            sb.append(escapeHtml(body.substring(curPos, Math.max(curPos, End))));
            Passage = sb.toString();
        } else {
            // Escape here as well so Passage is HTML-safe on every path.
            Passage = escapeHtml(body.substring(Start, End));
        }
    }

    /** Escapes {@code text} for inclusion in HTML. */
    private static String escapeHtml(final String text) {
        return StringEscapeUtils.escapeHtml4(text);
    }

    /** @return the result of {@code isUnallocated()} on the backing document */
    public boolean isUnallocated() {
        return DocData.isUnallocated();
    }

    /**
     * Orders hits by descending {@link #Score}.
     *
     * Uses {@link Float#compare(float, float)} so the ordering stays a
     * consistent total order even if a score is NaN (NaN-scored hits sort
     * before all others).
     *
     * @param o the hit to compare against
     * @return negative if this hit sorts before {@code o}, positive if
     *         after, zero if the scores are equal
     */
    @Override
    public int compareTo(final SearchHit o) {
        return Float.compare(o.Score, Score);
    }

    /**
     * Fills the hit-specific entries of {@code features}, then stores and
     * returns the weighted sum of all features as this hit's score.
     *
     * @param features   feature vector; the entries indexed by the
     *                   {@code HitRanker} constants used here are overwritten
     * @param weights    per-feature weights, same length as {@code features}
     * @param maxTermLen divisor for this hit's maximum term length; zero is
     *                   treated as one
     * @param distance   numerator of the proximity feature, divided by the
     *                   document's {@code BodyLen}
     * @param hitFreq    value stored as the hit-frequency feature
     * @param tfidf      value stored as the TF-IDF feature
     * @return the new value of {@link #Score}
     * @throws IllegalArgumentException if the two arrays differ in length
     */
    public float calculateScore(final double[] features, final double[] weights, final int maxTermLen,
            final int distance, final double hitFreq, final double tfidf) {
        if (features.length != weights.length) {
            throw new IllegalArgumentException("lengths of features and weights arrays differed");
        }
        // Guard every divisor against zero so the score cannot become NaN/Infinity.
        final double bodyLen = DocData.BodyLen == 0 ? 1 : DocData.BodyLen;
        final double termLenDivisor = maxTermLen == 0 ? 1 : maxTermLen;

        features[HitRanker.FTERM_TFIDF] = tfidf;
        features[HitRanker.FHIT_FREQUENCY] = hitFreq;
        features[HitRanker.FHIT_PROXIMITY] = distance / bodyLen;
        features[HitRanker.FTERM_LENGTH] = MaxTermLen / termLenDivisor;
        features[HitRanker.FTERM_PRIORITY] = 0.0;
        features[HitRanker.FUNUSED] = 0.0;
        features[HitRanker.FHIT_OFFSET] = Start / bodyLen;

        double sum = 0.0;
        for (int i = 0; i < features.length; ++i) {
            sum += features[i] * weights[i];
        }
        Score = (float) sum;
        return Score;
    }

    /**
     * Rescales {@link #Score} to {@code 10 * (Score - min) / range}. A
     * {@code range} of zero is treated as one. A negative result is
     * reported on standard error but still stored.
     *
     * @param min   value subtracted from the current score
     * @param range divisor applied after subtracting {@code min}
     */
    public void normalize(final double min, final double range) {
        final double newScore = 10 * ((Score - min) / (range == 0 ? 1 : range));
        if (newScore < 0) {
            // Report the hit's own pre-normalization score, which is the
            // value newScore was derived from.
            System.err.println("Negative score on " + DocData.fullpath() + "! newScore = " + newScore
                    + ", old score = " + Score + ", min = " + min + ", range = " + range);
        }
        Score = (float) newScore;
    }
}