pl.edu.icm.coansys.deduplication.document.DuplicateWorkDetectReduceService.java Source code

Introduction

Here is the source code for pl.edu.icm.coansys.deduplication.document.DuplicateWorkDetectReduceService.java. The class is a Spring-managed Hadoop reduce service from the CoAnSys project: it receives groups of candidate documents that share a key, recursively splits groups that are too large to compare directly, and writes the detected duplicates to the reducer context under a common cluster key.

Source

/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2013 ICM-UW
 *
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.coansys.deduplication.document;

import pl.edu.icm.coansys.commons.java.DocumentWrapperUtils;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import pl.edu.icm.coansys.commons.spring.DiReduceService;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.springframework.beans.factory.annotation.Value;
import pl.edu.icm.coansys.deduplication.document.keygenerator.WorkKeyGenerator;
import pl.edu.icm.coansys.models.DocumentProtos;

/**
 * 
 * @author Łukasz Dumiszewski
 *
 */

@Service("duplicateWorkDetectReduceService")
public class DuplicateWorkDetectReduceService implements DiReduceService<Text, BytesWritable, Text, Text> {

    private static final Logger log = LoggerFactory.getLogger(DuplicateWorkDetectReduceService.class);

    @Autowired
    private DuplicateWorkService duplicateWorkService;

    @Autowired
    private WorkKeyGenerator keyGen;

    private int initialMaxDocsSetSize;
    private int maxDocsSetSizeInc;
    private int maxSplitLevel;

    //******************** DiReduceService Implementation ********************

    @Override
    public void reduce(Text key, Iterable<BytesWritable> values,
            Reducer<Text, BytesWritable, Text, Text>.Context context) throws IOException, InterruptedException {

        log.info("starting reduce, key: " + key.toString());

        List<DocumentProtos.DocumentMetadata> documents = DocumentWrapperUtils.extractDocumentMetadata(key, values);

        long startTime = System.currentTimeMillis();

        Configuration conf = context.getConfiguration();
        initialMaxDocsSetSize = conf.getInt("INITIAL_MAX_DOCS_SET_SIZE", initialMaxDocsSetSize);
        maxDocsSetSizeInc = conf.getInt("MAX_DOCS_SET_SIZE_INC", maxDocsSetSizeInc);
        maxSplitLevel = conf.getInt("MAX_SPLIT_LEVEL", maxSplitLevel);

        process(key, context, documents, 0, initialMaxDocsSetSize);

        log.info("time [msec]: " + (new Date().getTime() - startTime));

    }
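    // The three thresholds read above fall back to the @Value defaults set at the bottom of this
    // class. A minimal driver-side sketch of overriding them in the job configuration (illustrative,
    // assuming a standard Hadoop job setup; not part of this class):
    //
    //   Configuration conf = job.getConfiguration();
    //   conf.setInt("INITIAL_MAX_DOCS_SET_SIZE", 500);
    //   conf.setInt("MAX_DOCS_SET_SIZE_INC", 100);
    //   conf.setInt("MAX_SPLIT_LEVEL", 8);
    //   conf.set("DEDUPLICATION_DEBUG_MODE", "true"); // have findDuplicates write directly to the context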

    //******************** PRIVATE ********************

    /**
     * Processes the given documents: finds duplicates and saves them to the context under the same, common key.
     * If the number of passed documents is greater than <b>maxNumberOfDocuments</b>, the documents are split into
     * smaller packs and the method is invoked recursively on each pack.
     * @param key a common key of the documents
     * @param context the reducer context the detected duplicates are written to
     * @param documents the documents to process
     * @param level the recursion depth, used to generate a proper key for the passed documents; the greater the
     * level, the longer (and more unique) the generated key
     * @param maxNumberOfDocuments the maximum number of documents compared directly, without further splitting
     */
    void process(Text key, Reducer<Text, BytesWritable, Text, Text>.Context context,
            List<DocumentProtos.DocumentMetadata> documents, int level, int maxNumberOfDocuments)
            throws IOException, InterruptedException {
        String dashes = getDashes(level);
        log.info(dashes + "start process, key: {}, number of documents: {}", key.toString(), documents.size());
        if (documents.size() < 2) {
            log.info(dashes + "one document only, ommiting");
            return;
        }

        int lev = level + 1;
        int maxNumOfDocs = maxNumberOfDocuments;

        if (documents.size() > maxNumOfDocs) {
            Map<Text, List<DocumentProtos.DocumentMetadata>> documentPacks = splitDocuments(key, documents, lev);
            log.info(dashes + "documents split into: {} packs", documentPacks.size());

            for (Map.Entry<Text, List<DocumentProtos.DocumentMetadata>> docs : documentPacks.entrySet()) {
                if (docs.getValue().size() == documents.size()) { // docs were not split: the generated key is the same for all of them, which may happen if the documents share the same short title, e.g. "news in brief"
                    maxNumOfDocs += maxDocsSetSizeInc;
                }
                process(docs.getKey(), context, docs.getValue(), lev, maxNumOfDocs);
            }

        } else {
            if (isDebugMode(context.getConfiguration())) {
                duplicateWorkService.findDuplicates(documents, context);
            } else {
                Map<Integer, Set<DocumentProtos.DocumentMetadata>> duplicateWorksMap = duplicateWorkService
                        .findDuplicates(documents, null);
                saveDuplicatesToContext(duplicateWorksMap, key, context);
            }
            context.progress();
        }
        log.info(dashes + "end process, key: {}", key);

    }
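    // Worked example of the recursion (illustrative numbers, assuming initialMaxDocsSetSize = 1000
    // and maxDocsSetSizeInc = 200): a pack of 3500 documents exceeds 1000, so it is split by
    // level-1 keys into, say, packs of 1800, 1200 and 500. The 500-pack is compared directly; the
    // two larger packs are split again at level 2. If a pack does not shrink because every document
    // yields the same key, the allowed pack size grows by 200 per level, and above maxSplitLevel
    // the pack is cut in half by force (see splitDocuments below), so the recursion terminates.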

    private String getDashes(int level) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i <= level; i++) {
            sb.append("-");
        }
        return sb.toString();
    }

    private boolean isDebugMode(Configuration conf) {
        if (conf == null) {
            return false;
        }
        return "true".equals(conf.get("DEDUPLICATION_DEBUG_MODE", "false"));
    }

    /**
     * Splits the passed documents into smaller parts. The documents are divided into packs according to the
     * generated keys. The keys are generated using the
     * {@link WorkKeyGenerator#generateKey(DocumentProtos.DocumentMetadata, int)} method.
     */
    Map<Text, List<DocumentProtos.DocumentMetadata>> splitDocuments(Text key,
            List<DocumentProtos.DocumentMetadata> documents, int level) {

        // check whether this set was already forced to split; if so, keep the suffix
        String keyStr = key.toString();
        String suffix = "";
        int dashIndex = keyStr.indexOf('-');
        if (dashIndex >= 0) {
            suffix = keyStr.substring(dashIndex + 1);
        }

        Map<Text, List<DocumentProtos.DocumentMetadata>> splitDocuments = Maps.newHashMap();
        for (DocumentProtos.DocumentMetadata doc : documents) {
            String newKeyStr = keyGen.generateKey(doc, level);
            if (!suffix.isEmpty()) {
                newKeyStr = newKeyStr + "-" + suffix;
            }
            Text newKey = new Text(newKeyStr);
            List<DocumentProtos.DocumentMetadata> list = splitDocuments.get(newKey);
            if (list == null) {
                list = Lists.newArrayList();
                splitDocuments.put(newKey, list);
            }
            list.add(doc);
        }

        if (level > maxSplitLevel && splitDocuments.size() == 1) {
            //force split into 2 parts
            Text commonKey = splitDocuments.keySet().iterator().next();
            String commonKeyStr = commonKey.toString();
            if (!commonKeyStr.contains("-")) {
                commonKeyStr += "-";
            }
            Text firstKey = new Text(commonKeyStr + "0");
            Text secondKey = new Text(commonKeyStr + "1");
            List<DocumentProtos.DocumentMetadata> fullList = splitDocuments.get(commonKey);
            int items = fullList.size();
            List<DocumentProtos.DocumentMetadata> firstHalf = fullList.subList(0, items / 2);
            List<DocumentProtos.DocumentMetadata> secondHalf = fullList.subList(items / 2, items);
            splitDocuments.clear();
            splitDocuments.put(firstKey, firstHalf);
            splitDocuments.put(secondKey, secondHalf);
        }

        return splitDocuments;
    }

    private void saveDuplicatesToContext(Map<Integer, Set<DocumentProtos.DocumentMetadata>> sameWorksMap, Text key,
            Reducer<Text, BytesWritable, Text, Text>.Context context) throws IOException, InterruptedException {

        for (Map.Entry<Integer, Set<DocumentProtos.DocumentMetadata>> entry : sameWorksMap.entrySet()) {
            String sameWorksKey = key.toString() + "_" + entry.getKey();
            for (DocumentProtos.DocumentMetadata doc : entry.getValue()) {
                context.write(new Text(sameWorksKey), new Text(doc.getKey()));
            }
        }

    }
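    // Example: for the reducer key "newsi" and a map entry 3 -> {docA, docB} (hypothetical values),
    // the method above emits the pairs ("newsi_3", "docA") and ("newsi_3", "docB"), i.e. all
    // members of one duplicates cluster share the synthetic key <reducerKey>_<clusterId>.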

    @Value("1000")
    public void setBeginPackSize(int beginPackSize) {
        this.initialMaxDocsSetSize = beginPackSize;
    }

    @Value("200")
    public void setPackSizeInc(int packSizeInc) {
        this.maxDocsSetSizeInc = packSizeInc;
    }

    @Value("10")
    public void setMaxSplitLevels(int maxSplitLevels) {
        this.maxSplitLevel = maxSplitLevels;
    }

    enum UnparseableIssue {
        UNPARSEABLE
    }
}
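
The splitting technique above groups documents by a key whose precision grows with the split level. The following standalone sketch illustrates that idea with a hypothetical title-prefix key generator; generateKey below is an assumption made for illustration, not the CoAnSys WorkKeyGenerator.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class KeySplitDemo {

    // Hypothetical key generator: deeper levels use a longer title prefix, producing finer buckets.
    static String generateKey(String title, int level) {
        String normalized = title.toLowerCase().replaceAll("[^a-z0-9]", "");
        int length = Math.min(normalized.length(), 3 + level * 2);
        return normalized.substring(0, length);
    }

    // Mirrors the bucketing loop in splitDocuments: one pack per generated key.
    static Map<String, List<String>> split(List<String> titles, int level) {
        Map<String, List<String>> packs = new HashMap<>();
        for (String title : titles) {
            packs.computeIfAbsent(generateKey(title, level), k -> new ArrayList<>()).add(title);
        }
        return packs;
    }

    public static void main(String[] args) {
        List<String> titles = List.of(
                "News in brief", "News in Brief!", "News in brief: science",
                "A study of document deduplication");
        // At level 1 the keys are short, so all "News in brief" variants land in one pack;
        // at level 5 the longer prefix separates "News in brief: science", while the two
        // genuinely identical short titles still share a pack -- exactly the case the
        // reduce service handles by growing the pack size and, eventually, forcing a split.
        System.out.println(split(titles, 1));
        System.out.println(split(titles, 5));
    }
}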