uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.EvaluationServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.EvaluationServiceImpl.java

Source

/** 
 * Copyright (c) 2015, The National Archives
 * http://www.nationalarchives.gov.uk 
 * 
 * This Source Code Form is subject to the terms of the Mozilla Public 
 * License, v. 2.0. If a copy of the MPL was not distributed with this 
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
package uk.gov.nationalarchives.discovery.taxonomy.common.service.impl;

import uk.gov.nationalarchives.discovery.taxonomy.common.domain.exception.TaxonomyException;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.lucene.InformationAssetViewFields;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.mongo.Category;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.mongo.CategoryEvaluationResult;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.mongo.EvaluationReport;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.repository.mongo.TestDocument;
import uk.gov.nationalarchives.discovery.taxonomy.common.domain.service.CategorisationResult;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.legacy.LegacySystemRepository;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.IAViewRepository;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.tools.LuceneTaxonomyMapper;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.mongo.CategoryRepository;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.mongo.EvaluationReportRepository;
import uk.gov.nationalarchives.discovery.taxonomy.common.repository.mongo.TestDocumentRepository;
import uk.gov.nationalarchives.discovery.taxonomy.common.service.CategoriserService;
import uk.gov.nationalarchives.discovery.taxonomy.common.service.EvaluationService;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.search.TopDocs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;

/**
 * Default {@link EvaluationService} implementation.
 * <p>
 * It builds a test dataset from documents already categorised by the legacy
 * system, runs the categoriser on that dataset, and produces a per-category
 * evaluation report by comparing the two sets of categories.
 */
// The unchecked/rawtypes suppression is needed at least for the raw List
// returned by CollectionUtils.arrayToList in getEvaluationReport.
@SuppressWarnings({ "rawtypes", "unchecked" })
@Service
public class EvaluationServiceImpl implements EvaluationService {

    private static final Logger logger = LoggerFactory.getLogger(EvaluationServiceImpl.class);

    private final CategoryRepository categoryRepository;

    private final IAViewRepository iaviewRepository;

    private final TestDocumentRepository testDocumentRepository;

    private final EvaluationReportRepository evaluationReportRepository;

    private final LegacySystemRepository legacySystemService;

    private final CategoriserService categoriserService;

    /**
     * Creates the service with all of its collaborators.
     *
     * @param categoryRepository
     *            source of the known categories
     * @param iaviewRepository
     *            index searched for the documents referenced by the legacy
     *            system
     * @param testDocumentRepository
     *            storage of the test dataset
     * @param evaluationReportRepository
     *            storage of the generated reports
     * @param legacySystemService
     *            access to the documents categorised by the legacy system
     * @param categoriserService
     *            categoriser under evaluation
     */
    @Autowired
    public EvaluationServiceImpl(CategoryRepository categoryRepository, IAViewRepository iaviewRepository,
            TestDocumentRepository testDocumentRepository, EvaluationReportRepository evaluationReportRepository,
            LegacySystemRepository legacySystemService, CategoriserService categoriserService) {
        super();
        this.categoryRepository = categoryRepository;
        this.iaviewRepository = iaviewRepository;
        this.testDocumentRepository = testDocumentRepository;
        this.evaluationReportRepository = evaluationReportRepository;
        this.legacySystemService = legacySystemService;
        this.categoriserService = categoriserService;
    }

    /**
     * Rebuilds the test dataset from scratch.
     * <p>
     * The test document collection is emptied first. Then, for every category
     * of the category repository, legacy documents are fetched page by page
     * and those that can be found in the IAView index are saved as test
     * documents, until {@code pMinNbOfElementsPerCat} documents were saved for
     * the category or no further progress can be made.
     *
     * @param pMinNbOfElementsPerCat
     *            number of test documents to collect for each category; when
     *            it is zero or negative nothing is collected
     */
    @Override
    public void createEvaluationTestDataset(int pMinNbOfElementsPerCat) {
        logger.info(".createEvaluationTestDataset> empty testDocument collection");
        testDocumentRepository.deleteAll();

        logger.info(".createEvaluationTestDataset> going to retrieve {} documents per category at least",
                pMinNbOfElementsPerCat);

        for (Category category : categoryRepository.findAll()) {
            logger.info(".createEvaluationTestDataset: processing category: {}", category.getTtl());
            collectTestDocumentsForCategory(category, pMinNbOfElementsPerCat);
        }
        logger.info(".createEvaluationTestDataset < END");
    }

    /**
     * Saves up to {@code targetCount} test documents for one category by
     * walking through the pages of legacy documents, starting at page 1.
     * <p>
     * Gives up on the category as soon as the legacy system returns nothing
     * for a page, or when a whole page did not yield a single document found
     * in the IAView index.
     */
    private void collectTestDocumentsForCategory(Category category, int targetCount) {
        String categoryTitle = category.getTtl();
        int savedCount = 0;
        int page = 1;

        while (savedCount < targetCount) {
            Map<String, String[]> legacyDocuments = legacySystemService
                    .findLegacyDocumentsByCategory(categoryTitle, page);

            if (legacyDocuments == null || legacyDocuments.isEmpty()) {
                logger.warn(
                        ".createTestDataset: giving up, no results found on legacy system or no document found in IAView repo for category: {}",
                        categoryTitle);
                return;
            }

            int savedFromThisPage = 0;
            for (Entry<String, String[]> legacyDocument : legacyDocuments.entrySet()) {
                // the key is used as the document reference to look up, the
                // value as the legacy categories of that document
                TopDocs topDocs = iaviewRepository.searchIAViewIndexByFieldAndPhrase(
                        InformationAssetViewFields.DOCREFERENCE.toString(), legacyDocument.getKey(), 1);

                if (topDocs.totalHits != 0) {
                    Document doc = iaviewRepository.getDoc(topDocs.scoreDocs[0]);
                    TestDocument testDocument = LuceneTaxonomyMapper.getTestDocumentFromLuceneDocument(doc);
                    testDocument.setLegacyCategories(legacyDocument.getValue());
                    testDocumentRepository.save(testDocument);
                    savedCount++;
                    savedFromThisPage++;
                }

                if (savedCount == targetCount) {
                    break;
                }
            }

            page++;
            if (savedFromThisPage == 0) {
                // NOTE(review): the "15" in the message is presumably the page
                // size of the legacy system - confirm against
                // LegacySystemRepository
                logger.warn(
                        ".createTestDataset: giving up, no results found on legacy system or no document found in IAView repo among 15 last previous attempts for category: {}",
                        categoryTitle);
                return;
            }
        }
    }

    /**
     * Categorises every document of the test dataset and stores the names of
     * the returned categories on the test document.
     * <p>
     * Documents for which the categoriser throws a {@link TaxonomyException}
     * are logged and skipped; they are not updated.
     *
     * @param matchNbOfReturnedCategories
     *            when {@code true}, at most as many categories as the document
     *            has legacy categories are kept, taken in the order returned
     *            by the categoriser; {@code null} is treated as {@code false}
     */
    @Override
    public void runCategorisationOnTestDataSet(Boolean matchNbOfReturnedCategories) {
        logger.info(".runCategorisationOnTestDataSet> START");
        logger.info(".runCategorisationOnTestDataSet: processing {} documents", testDocumentRepository.count());

        // null-safe: unboxing a null Boolean would throw a NullPointerException
        boolean limitToLegacyCount = Boolean.TRUE.equals(matchNbOfReturnedCategories);

        for (TestDocument testDocument : testDocumentRepository.findAll()) {
            List<CategorisationResult> categorisationResults;
            try {
                categorisationResults = categoriserService.testCategoriseSingle(testDocument.getDocReference());
            } catch (TaxonomyException e) {
                logger.error(
                        ".runCategorisationOnTestDataSet: an error occured while getting categories for doc: {}",
                        testDocument.getDocReference(), e);
                continue;
            }

            int nbOfCategoriesToKeep = categorisationResults.size();
            if (limitToLegacyCount) {
                // a document without legacy categories keeps no category at all
                String[] legacyCategories = testDocument.getLegacyCategories();
                int nbOfLegacyCategories = (legacyCategories != null) ? legacyCategories.length : 0;
                nbOfCategoriesToKeep = Math.min(nbOfCategoriesToKeep, nbOfLegacyCategories);
            }

            List<String> categories = new ArrayList<String>(nbOfCategoriesToKeep);
            for (int i = 0; i < nbOfCategoriesToKeep; i++) {
                categories.add(categorisationResults.get(i).getName());
            }
            testDocument.setCategories(categories.toArray(new String[0]));
            testDocumentRepository.save(testDocument);
        }
        logger.info(".runCategorisationOnTestDataSet < END");
    }

    /**
     * Compares, for every test document, the categories set by the categoriser
     * with the legacy categories, then builds, saves and returns the resulting
     * report.
     * <p>
     * Per category the following occurrences are counted:
     * <ul>
     * <li>true positive: the category is in both lists of a document</li>
     * <li>false positive: the category is only in the categoriser list</li>
     * <li>false negative: the category is only in the legacy list</li>
     * </ul>
     * Categories of the category repository that were not met in any test
     * document get an entry of their own.
     *
     * @param comments
     *            free text passed on to the report
     * @return the report, after it was saved in the report repository
     */
    @Override
    public EvaluationReport getEvaluationReport(String comments) {
        logger.info(".getEvaluationReport > START");
        Map<String, Integer> mapOfTruePositivesPerCat = new HashMap<String, Integer>();
        Map<String, Integer> mapOfFalsePositivesPerCat = new HashMap<String, Integer>();
        Map<String, Integer> mapOfFalseNegativesPerCat = new HashMap<String, Integer>();
        long numberOfDocuments = testDocumentRepository.count();

        for (TestDocument testDocument : testDocumentRepository.findAll()) {
            List<String> categories = CollectionUtils.arrayToList(testDocument.getCategories());
            List<String> legacyCategories = CollectionUtils.arrayToList(testDocument.getLegacyCategories());

            for (String category : categories) {
                Map<String, Integer> counters = legacyCategories.contains(category) ? mapOfTruePositivesPerCat
                        : mapOfFalsePositivesPerCat;
                incrementMapValueForCategory(counters, category);
            }

            for (String legacyCategory : legacyCategories) {
                if (!categories.contains(legacyCategory)) {
                    incrementMapValueForCategory(mapOfFalseNegativesPerCat, legacyCategory);
                }
            }
        }

        Set<String> setOfCategories = new HashSet<String>();
        setOfCategories.addAll(mapOfTruePositivesPerCat.keySet());
        setOfCategories.addAll(mapOfFalsePositivesPerCat.keySet());
        setOfCategories.addAll(mapOfFalseNegativesPerCat.keySet());

        List<CategoryEvaluationResult> listOfEvaluationResults = new ArrayList<CategoryEvaluationResult>();
        for (String category : setOfCategories) {
            // the last argument tells whether the category repository knows a
            // category with that title
            boolean foundInCategoryRepository = categoryRepository.findByTtl(category) != null;
            listOfEvaluationResults.add(new CategoryEvaluationResult(category,
                    countOrZero(mapOfTruePositivesPerCat, category), countOrZero(mapOfFalsePositivesPerCat, category),
                    countOrZero(mapOfFalseNegativesPerCat, category), foundInCategoryRepository));
        }

        for (Category category : categoryRepository.findAll()) {
            String categoryName = category.getTtl();
            if (!setOfCategories.contains(categoryName)) {
                // NOTE(review): the meaning of the three flags is defined by
                // CategoryEvaluationResult - confirm there
                listOfEvaluationResults.add(new CategoryEvaluationResult(categoryName, true, false, false));
            }
        }

        EvaluationReport report = new EvaluationReport(comments, listOfEvaluationResults, (int) numberOfDocuments);
        evaluationReportRepository.save(report);
        logger.info(".getEvaluationReport < END");
        return report;
    }

    /**
     * Returns the counter stored for the category, or 0 when there is none.
     */
    private int countOrZero(Map<String, Integer> mapOfOccurencesPerCat, String category) {
        Integer counter = mapOfOccurencesPerCat.get(category);
        return (counter != null) ? counter : 0;
    }

    /**
     * Adds one to the counter of the category, starting at 1 when the category
     * has no counter yet.
     */
    private void incrementMapValueForCategory(Map<String, Integer> mapOfOccurencesPerCat, String category) {
        Integer existingCounter = mapOfOccurencesPerCat.get(category);
        int newCounter = (existingCounter == null) ? 1 : existingCounter + 1;
        mapOfOccurencesPerCat.put(category, newCounter);
    }

}