pl.edu.icm.cermine.metadata.extraction.enhancers.AuthorTitleSplitterEnhancer.java Source code

Java tutorial

Introduction

Here is the source code for pl.edu.icm.cermine.metadata.extraction.enhancers.AuthorTitleSplitterEnhancer.java

Source

/**
 * This file is part of CERMINE project.
 * Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CERMINE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CERMINE. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.cermine.metadata.extraction.enhancers;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
import pl.edu.icm.cermine.structure.HierarchicalReadingOrderResolver;
import pl.edu.icm.cermine.structure.ReadingOrderResolver;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxLine;
import pl.edu.icm.cermine.structure.model.BxZone;
import pl.edu.icm.cermine.structure.model.BxZoneLabel;
import pl.edu.icm.cermine.structure.tools.BxBoundsBuilder;

/**
 * Author enhancer.
 *
 * This enhancer should be invoked after affiliation enhancer.
 *
 * @author krusek
 * @author Dominika Tkaczyk
 */
public class AuthorTitleSplitterEnhancer extends AbstractSimpleEnhancer {

    public AuthorTitleSplitterEnhancer() {
        setSearchedZoneLabels(EnumSet.of(BxZoneLabel.MET_AUTHOR));
    }

    @Override
    protected boolean enhanceMetadata(BxDocument document, DocumentMetadata metadata) {
        for (BxZone zone : document.getFirstChild()) {
            if (BxZoneLabel.MET_TITLE.equals(zone.getLabel())) {
                return false;
            }
        }

        ReadingOrderResolver roResolver = new HierarchicalReadingOrderResolver();

        BxZone toDel = null;
        BxZone toAdd1 = null;
        BxZone toAdd2 = null;

        for (BxZone zone : filterZones(document.getFirstChild())) {
            BxZone z1 = new BxZone();
            z1.setLabel(BxZoneLabel.MET_TITLE);
            BxBoundsBuilder b1 = new BxBoundsBuilder();
            BxZone z2 = new BxZone();
            z2.setLabel(BxZoneLabel.MET_AUTHOR);
            BxBoundsBuilder b2 = new BxBoundsBuilder();
            boolean wasAuthor = false;
            BxLine prev = null;
            for (BxLine line : zone) {
                if (prev != null && Sets.intersection(prev.getFontNames(), line.getFontNames()).isEmpty()) {
                    String[] words = line.toText().split(" ");
                    int cWordsCount = 0;
                    int wordsCount = 0;
                    for (String s : words) {
                        if (s.matches(".*[a-zA-Z].*")) {
                            wordsCount++;
                        }
                        if (s.matches("[A-Z].*")) {
                            cWordsCount++;
                        }
                    }
                    if (line.toText().contains(",") && (double) cWordsCount / (double) wordsCount > 0.7) {
                        wasAuthor = true;
                    }
                }
                if (wasAuthor) {
                    z2.addLine(line);
                    b2.expand(line.getBounds());
                } else {
                    z1.addLine(line);
                    b1.expand(line.getBounds());
                }
                prev = line;
            }
            z1.setBounds(b1.getBounds());
            z2.setBounds(b2.getBounds());
            if (z1.hasChildren() && z2.hasChildren()) {
                toDel = zone;
                toAdd1 = z1;
                toAdd2 = z2;
            }
        }

        if (toDel != null) {
            List<BxZone> list = new ArrayList<BxZone>();
            list.addAll(Lists.newArrayList(document.getFirstChild()));
            list.remove(toDel);
            list.add(toAdd1);
            list.add(toAdd2);
            document.getFirstChild().setZones(list);
            try {
                roResolver.resolve(document);
            } catch (AnalysisException ex) {
            }
        }

        return false;
    }

    @Override
    protected Set<EnhancedField> getEnhancedFields() {
        return EnumSet.noneOf(EnhancedField.class);
    }

}