uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.TSetBasedCategoriserServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.TSetBasedCategoriserServiceImpl.java

Source

/** 
 * Copyright (c) 2015, The National Archives
 * http://www.nationalarchives.gov.uk 
 * 
 * This Source Code Form is subject to the terms of the Mozilla Public 
 * License, v. 2.0. If a copy of the MPL was not distributed with this 
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
package uk.gov.nationalarchives.discovery.taxonomy.common.service.impl;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.exception.TaxonomyErrorType;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.exception.TaxonomyException;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.lucene.InformationAssetView;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.lucene.InformationAssetViewFields;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.mongo.CategoryWithLuceneQuery;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.mongo.IAViewUpdate;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.service.TSetBasedCategorisationResult;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.IAViewRepository;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.tools.LuceneHelperTools;
import uk.gov.nationalarchives.discovery.taxonomy.common.service.CategoriserService;

import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.Field;
import java.util.*;

/**
 * class dedicated to the categorisation of documents<br/>
 * use the More Like This feature of Lucene
 *
 */
@Service("categoriserService")
@ConditionalOnProperty(prefix = "lucene.categoriser.", value = "useTSetBasedCategoriser")
public class TSetBasedCategoriserServiceImpl implements CategoriserService<TSetBasedCategorisationResult> {

    private static final Logger logger = LoggerFactory.getLogger(TSetBasedCategoriserServiceImpl.class);

    @Autowired
    private IndexReader iaViewIndexReader;

    @Autowired
    private IndexReader trainingSetIndexReader;

    @Autowired
    private IAViewRepository iaViewRepository;

    @Autowired
    private SearcherManager trainingSetSearcherManager;

    @Autowired
    private Analyzer trainingSetAnalyser;

    @Value("${lucene.mlt.mimimumScoreForMlt}")
    private float mimimumScoreForMlt;
    @Value("${lucene.mlt.mimimumGlobalScoreForACategory}")
    private float mimimumGlobalScoreForACategory;
    @Value("${lucene.mlt.maximumSimilarElements}")
    private int maximumSimilarElements;

    @Value("${lucene.mlt.minTermFreq}")
    private int minTermFreq;

    @Value("${lucene.mlt.minDocFreq}")
    private int minDocFreq;

    @Value("${lucene.mlt.descBoostingFactor}")
    private float descBoostingFactor;

    @Value("${lucene.mlt.contextDescBoostingFactor}")
    private float contextDescBoostingFactor;

    @Value("${lucene.mlt.titleBoostingFactor}")
    private float titleBoostingFactor;

    @Value("${lucene.mlt.fieldsToAnalyse}")
    private String fieldsToAnalyse;

    /**
     * run More Like This process on a document by comparing its description to
     * the description of all items of the training set<br/>
     * currently we get a fixed number of the top results
     * 
     * @param document
     *            document being tested
     * @return
     * @throws IOException
     */
    public List<TSetBasedCategorisationResult> runMlt(Document document) {

        Map<String, TSetBasedCategorisationResult> result = null;
        IndexSearcher searcher = null;
        try {
            trainingSetSearcherManager.maybeRefresh();
            // Boolean wasRefreshed = trainingSetSearcherManager.maybeRefresh();
            // if (wasRefreshed) {
            // logger.debug(".runMlt: training set searcher had to be refreshed");
            // }
            searcher = trainingSetSearcherManager.acquire();

            // TODO TSETBASED refresh reader/searcher: Use readermanager and
            // refresh it?
            MoreLikeThis moreLikeThis = new MoreLikeThis(this.trainingSetIndexReader);
            moreLikeThis.setMinTermFreq(minTermFreq);
            moreLikeThis.setMinDocFreq(minDocFreq);
            moreLikeThis.setAnalyzer(this.trainingSetAnalyser);
            moreLikeThis.setFieldNames(fieldsToAnalyse.split(","));
            moreLikeThis.setBoost(true);

            BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();

            for (String fieldName : fieldsToAnalyse.split(",")) {
                String value = document.get(fieldName);
                if (value != null && !"null".equals(value)) {

                    switch (InformationAssetViewFields.valueOf(fieldName)) {
                    case DESCRIPTION:
                        moreLikeThis.setBoostFactor(descBoostingFactor);
                        break;
                    case TITLE:
                        moreLikeThis.setBoostFactor(titleBoostingFactor);
                        break;
                    case CONTEXTDESCRIPTION:
                        moreLikeThis.setBoostFactor(contextDescBoostingFactor);
                        break;
                    default:
                    case SUBJECTS:
                    case CORPBODYS:
                    case PERSON_FULLNAME:
                    case PLACE_NAME:
                        moreLikeThis.setBoostFactor(1);
                        break;
                    }
                    Query query = moreLikeThis.like(fieldName, new StringReader(value));
                    queryBuilder.add(query, Occur.SHOULD);
                }
            }
            BooleanQuery fullQuery = queryBuilder.build();

            TopDocs topDocs = searcher.search(fullQuery, this.maximumSimilarElements);
            logger.debug(".runMlt: found {} total hits, processed at maximum {} hits", topDocs.totalHits,
                    this.maximumSimilarElements);

            result = new LinkedHashMap<String, TSetBasedCategorisationResult>();

            int size = 0;
            if (topDocs.totalHits <= this.maximumSimilarElements) {
                size = topDocs.totalHits - 1;
            } else {
                size = this.maximumSimilarElements - 1;
            }

            for (int i = 0; i < size; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                Float currrentScore = scoreDoc.score;

                if (currrentScore < this.mimimumScoreForMlt) {
                    break;
                }

                Document hitDoc = searcher.doc(scoreDoc.doc);
                String category = hitDoc.get(InformationAssetViewFields.TAXONOMY.toString());
                String docReference = hitDoc.get(InformationAssetViewFields.DOCREFERENCE.toString());
                logger.debug(".runMlt: found doc, category: {}, score: {}, docreference: {}", category,
                        currrentScore, docReference);

                TSetBasedCategorisationResult existingCategorisationResult = result.get(category);
                Float scoreToSet = currrentScore;
                Integer numberOfFoundDocuments = 1;
                // k nearest neighbour algorithm
                if (existingCategorisationResult != null) {
                    scoreToSet += existingCategorisationResult.getScore();
                    numberOfFoundDocuments += existingCategorisationResult.getNumberOfFoundDocuments();
                }
                result.put(category,
                        new TSetBasedCategorisationResult(category, scoreToSet, numberOfFoundDocuments));

            }

        } catch (IOException e) {
            throw new TaxonomyException(TaxonomyErrorType.LUCENE_IO_EXCEPTION, e);
        } finally {
            LuceneHelperTools.releaseSearcherManagerQuietly(trainingSetSearcherManager, searcher);
        }

        List<TSetBasedCategorisationResult> sortedResults = sortCategorisationResultsByScoreDescAndFilterByGlobalScore(
                new ArrayList<TSetBasedCategorisationResult>(result.values()));

        return sortedResults;
    }

    private List<TSetBasedCategorisationResult> sortCategorisationResultsByScoreDescAndFilterByGlobalScore(
            List<TSetBasedCategorisationResult> categorisationResults) {
        // Sort results by Score in descending Order
        Collections.sort(categorisationResults, new Comparator<TSetBasedCategorisationResult>() {
            public int compare(TSetBasedCategorisationResult a, TSetBasedCategorisationResult b) {
                return b.getScore().compareTo(a.getScore());
            }
        });

        // add entries to the linkedList to return. Do not add entries below the
        // minimum global score for a category
        List<TSetBasedCategorisationResult> sortedResults = new LinkedList<TSetBasedCategorisationResult>();
        for (TSetBasedCategorisationResult entry : categorisationResults) {
            if (entry.getScore() < this.mimimumGlobalScoreForACategory) {
                break;
            }
            sortedResults.add(entry);
        }
        return sortedResults;
    }

    public List<TSetBasedCategorisationResult> testCategoriseSingle(InformationAssetView iaView) {

        logger.info(".testCategoriseSingle: catdocref:{}, docreference:{} ", iaView.getCATDOCREF(),
                iaView.getDOCREFERENCE());
        Document doc = new Document();
        try {
            for (Field field : iaView.getClass().getDeclaredFields()) {
                field.setAccessible(true);
                String fieldName = field.getName();

                if (CollectionUtils.contains(Arrays.asList(fieldsToAnalyse.split(",")).iterator(), fieldName)) {
                    String value = String.valueOf(field.get(iaView));
                    if (value != null && !"null".equals(value)) {
                        doc.add(new TextField(fieldName, value, Store.YES));
                    }
                }
            }

        } catch (IllegalArgumentException e) {
            throw new RuntimeException(e);
        } catch (IllegalAccessException e) {
            throw new RuntimeException(e);
        }

        return runMlt(doc);
    }

    @Override
    public List<TSetBasedCategorisationResult> testCategoriseSingle(String docReference) {
        logger.info(".testCategoriseSingle: docreference:{} ", docReference);
        return testCategoriseSingle((iaViewRepository.searchDocByDocReference(docReference)));
    }

    @Override
    public List<TSetBasedCategorisationResult> categoriseSingle(String docReference) {
        // TODO TSETBASED Auto-generated method stub
        return null;
    }

    public List<TSetBasedCategorisationResult> categoriseSingle(InformationAssetView iaView) {
        // TODO TSETBASED Auto-generated method stub
        return null;
    }

    @Override
    public IAViewUpdate findLastIAViewUpdate() {
        // TODO TSETBASED Auto-generated method stub
        return null;
    }

    @Override
    public void refreshTaxonomyIndex() {
        // TODO TSETBASED Auto-generated method stub
    }

    @Override
    public List<TSetBasedCategorisationResult> categoriseSingle(String docReference,
            List<CategoryWithLuceneQuery> cachedCategories) {
        // TODO TSETBASED Auto-generated method stub
        return null;
    }

    @Override
    public List<IAViewUpdate> getNewCategorisedDocumentsFromDateToNSecondsInPast(Date date, int nbOfSeconds,
            int limit) {
        // TODO TSETBASED Auto-generated method stub
        return null;
    }

    @Override
    public List<IAViewUpdate> getNewCategorisedDocumentsAfterDocumentAndUpToNSecondsInPast(
            IAViewUpdate afterIAViewUpdate, int nbOfSeconds, int limit) {
        // TODO TSETBASED Auto-generated method stub
        return null;
    }

}