ru.ksu.niimm.cll.mocassin.crawl.parser.latex.StructureBuilderImpl.java Source code

Java tutorial

Introduction

Here is the source code for ru.ksu.niimm.cll.mocassin.crawl.parser.latex.StructureBuilderImpl.java

Source

/*******************************************************************************
 * Copyright (c) 2010-2012 Nikita Zhiltsov.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *     Nikita Zhiltsov - initial API and implementation
 *     Azat Khasanshin - implementation
 ******************************************************************************/
package ru.ksu.niimm.cll.mocassin.crawl.parser.latex;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;

import net.sourceforge.texlipse.model.DocumentReference;
import net.sourceforge.texlipse.model.OutlineNode;

import org.slf4j.Logger;

import ru.ksu.niimm.cll.mocassin.crawl.parser.impl.EdgeContextImpl;
import ru.ksu.niimm.cll.mocassin.crawl.parser.impl.EdgeImpl;
import ru.ksu.niimm.cll.mocassin.crawl.parser.impl.NodeImpl;
import ru.ksu.niimm.cll.mocassin.crawl.parser.pdf.Latex2PDFMapper;
import ru.ksu.niimm.cll.mocassin.util.inject.log.InjectLogger;

import com.google.common.collect.Iterables;
import com.google.inject.Inject;

import edu.uci.ics.jung.graph.DirectedSparseMultigraph;
import edu.uci.ics.jung.graph.Graph;

/**
 * Builds the structure graph of a parsed LaTeX document. Vertices are built
 * from the outline nodes of the document; edges are either
 * {@link EdgeType#CONTAINS} (an element encloses another one) or
 * {@link EdgeType#REFERS_TO} (an element holds a reference to a label that
 * belongs to another element).
 * 
 * @author Nikita Zhiltsov
 * 
 */
class StructureBuilderImpl implements StructureBuilder {
    /** Vertex id pattern: begin line and offset on that line. */
    private static final String NODE_ID_FORMAT = "%d_%d";
    @InjectLogger
    private Logger logger;
    private final Latex2PDFMapper latex2pdfMapper;

    @Inject
    private StructureBuilderImpl(Latex2PDFMapper latex2pdfMapper) {
        this.latex2pdfMapper = latex2pdfMapper;
    }

    /**
     * {@inheritDoc}
     * 
     * <p>
     * Returns an empty graph (and logs a warning) when the model is
     * {@code null}. Otherwise every top-level outline node is attached to the
     * document root with a CONTAINS edge, the outline is walked with an
     * explicit stack to add the nested CONTAINS and the REFERS_TO edges, and
     * finally the PDF page number of every vertex is filled in.
     */
    @Override
    public Graph<Node, Edge> buildStructureGraph(LatexDocumentModel parsedModel) {
        Graph<Node, Edge> hypergraph = new DirectedSparseMultigraph<Node, Edge>();
        if (parsedModel == null) {
            logger.warn("The parsed model is null. An empty graph will be returned");
            return hypergraph;
        }

        Stack<OutlineNode> stack = new Stack<OutlineNode>();
        List<OutlineNode> tree = parsedModel.getTree();
        if (!tree.isEmpty()) {
            // The root vertex is the same for every top-level item, so it is
            // built once instead of once per iteration.
            Node documentRootNode = createDocumentRootNode(parsedModel.getDocumentRoot());
            // Push in reverse order so that the first top-level item is
            // popped first.
            for (int i = tree.size() - 1; i >= 0; i--) {
                OutlineNode treeItem = tree.get(i);
                stack.push(treeItem);
                addEdge(hypergraph, documentRootNode, treeItem, EdgeType.CONTAINS, parsedModel);
            }
        }

        while (!stack.isEmpty()) {
            OutlineNode node = stack.pop();
            List<OutlineNode> children = node.getChildren();
            if (children == null) {
                continue;
            }

            String nodeTitle = extractTitle(node, parsedModel);
            boolean isNumbered = getNumberedProperty(node, parsedModel);
            Node from = new NodeImpl.Builder(nodeIdOf(node), extractName(node, parsedModel))
                    .beginLine(node.getBeginLine()).endLine(node.getEndLine()).offset(node.getOffsetOnLine())
                    .isEnvironment(node.getType() == OutlineNode.TYPE_ENVIRONMENT).title(nodeTitle)
                    .numbered(isNumbered).build();
            for (OutlineNode child : children) {
                if (child.getType() == OutlineNode.TYPE_LABEL) {
                    // A label child is not a vertex of its own: it names the
                    // enclosing element and is the target of references.
                    from.setLabelText(child.getName());
                    List<DocumentReference> references = getReferencesForLabel(child, parsedModel);
                    addReferenceEdges(hypergraph, references, from, parsedModel);
                } else {
                    addEdge(hypergraph, from, child, EdgeType.CONTAINS, parsedModel);
                    stack.push(child);
                }
            }
        }
        fillPageNumbers(hypergraph, parsedModel);
        return hypergraph;
    }

    /**
     * Builds the vertex that stands for the document root. It is never an
     * environment and never numbered.
     * 
     * @param documentRoot
     *            outline node of the document root
     * @return the vertex for the document root
     */
    private Node createDocumentRootNode(OutlineNode documentRoot) {
        return new NodeImpl.Builder(nodeIdOf(documentRoot), documentRoot.getName())
                .beginLine(documentRoot.getBeginLine()).endLine(documentRoot.getEndLine())
                .offset(documentRoot.getOffsetOnLine()).isEnvironment(false).numbered(false).build();
    }

    /**
     * Sets the PDF page number of every vertex of the graph. The lines of a
     * vertex are probed from its begin line to its end line and the first
     * positive page number returned by the mapper is used; if no line yields
     * a positive value, the value returned for the last probed line is stored
     * (or 0 when there was no line to probe).
     * 
     * @param hypergraph
     *            graph whose vertices are updated
     * @param model
     *            model that supplies the document id
     */
    private void fillPageNumbers(Graph<Node, Edge> hypergraph, LatexDocumentModel model) {
        Collection<Node> nodes = hypergraph.getVertices();

        for (Node node : nodes) {
            int pageNumber = 0;
            for (int l = node.getBeginLine(); l <= node.getEndLine(); l++) {
                pageNumber = latex2pdfMapper.getPDFPageNumber(l, model.getDocId());
                if (pageNumber > 0) {
                    break;
                }
            }
            node.setPdfPageNumber(pageNumber);
        }
    }

    /**
     * Adds an edge of the given type from an existing vertex to the vertex
     * built from {@code toNode}.
     * 
     * @param hypergraph
     *            graph to update
     * @param from
     *            source vertex
     * @param toNode
     *            outline node the target vertex is built from
     * @param edgeType
     *            type of the new edge
     * @param model
     *            parsed document model
     */
    private void addEdge(Graph<Node, Edge> hypergraph, Node from, OutlineNode toNode, EdgeType edgeType,
            LatexDocumentModel model) {
        addEdge(hypergraph, newEdge(edgeType), from, toGraphNode(toNode, model));
    }

    /**
     * Returns the name of the vertex built from the given outline node:
     * a fixed word for (sub)sections, otherwise the title of the matching
     * newtheorem command, or the plain node name when there is no match.
     * 
     * @param node
     *            outline node
     * @param model
     *            model that supplies the newtheorem commands
     * @return the vertex name
     */
    private String extractName(OutlineNode node, LatexDocumentModel model) {
        String nodeName;
        switch (node.getType()) {
        case OutlineNode.TYPE_SECTION:
            nodeName = "section";
            break;
        case OutlineNode.TYPE_SUBSECTION:
            nodeName = "subsection";
            break;
        case OutlineNode.TYPE_SUBSUBSECTION:
            nodeName = "subsubsection";
            break;
        default: {
            nodeName = node.getName();
            NewtheoremCommand foundCommand = findNewtheorem(nodeName, model);
            if (foundCommand != null) {
                nodeName = foundCommand.getTitle();
            }
        }
        }
        return nodeName;
    }

    /**
     * Returns the title of the vertex built from the given outline node: the
     * node name for (sub)sections, otherwise the title of the matching
     * newtheorem command, or the node name when there is no match.
     * 
     * @param node
     *            outline node
     * @param model
     *            model that supplies the newtheorem commands
     * @return the vertex title
     */
    private String extractTitle(OutlineNode node, LatexDocumentModel model) {
        String nodeName = node.getName();
        if (isSectioningType(node.getType())) {
            return nodeName;
        }
        NewtheoremCommand foundCommand = findNewtheorem(nodeName, model);
        return foundCommand != null ? foundCommand.getTitle() : nodeName;
    }

    /**
     * Tells whether the element behind the given outline node is numbered.
     * (Sub)sections are always reported as numbered; a name ending with
     * {@code *} is not numbered; otherwise the matching newtheorem command
     * decides, and the answer is {@code true} when there is no match.
     * 
     * @param node
     *            outline node
     * @param model
     *            model that supplies the newtheorem commands
     * @return {@code true} if the element is considered numbered
     */
    private boolean getNumberedProperty(OutlineNode node, LatexDocumentModel model) {
        if (isSectioningType(node.getType())) {
            return true; // TODO: not accurate!!
        }

        String nodeName = node.getName();
        if (nodeName.endsWith("*")) {
            return false;
        }
        NewtheoremCommand foundCommand = findNewtheorem(nodeName, model);
        if (foundCommand != null) {
            return foundCommand.isNumbered();
        }
        return true;
    }

    /**
     * Adds an edge of the given type from the vertex built from
     * {@code fromNode} to an existing vertex.
     * 
     * @param hypergraph
     *            graph to update
     * @param fromNode
     *            outline node the source vertex is built from
     * @param to
     *            target vertex
     * @param edgeType
     *            type of the new edge
     * @param model
     *            parsed document model
     */
    private void addInverseEdge(Graph<Node, Edge> hypergraph, OutlineNode fromNode, Node to, EdgeType edgeType,
            LatexDocumentModel model) {
        addEdge(hypergraph, newEdge(edgeType), toGraphNode(fromNode, model), to);
    }

    /**
     * Inserts the edge into the graph. When the graph already contains a
     * vertex equal to an endpoint, that stored instance is used as the
     * endpoint instead of the one passed in.
     * 
     * @param hypergraph
     *            graph to update
     * @param edge
     *            edge to insert
     * @param from
     *            source vertex
     * @param to
     *            target vertex
     */
    private void addEdge(Graph<Node, Edge> hypergraph, Edge edge, final Node from, final Node to) {
        Node source = hypergraph.containsVertex(from) ? findVertice(hypergraph, from) : from;
        Node target = hypergraph.containsVertex(to) ? findVertice(hypergraph, to) : to;
        hypergraph.addEdge(edge, source, target);
    }

    /**
     * Returns the vertex stored in the graph that equals the given one.
     * 
     * @param hypergraph
     *            graph to search
     * @param node
     *            vertex to look for
     * @return the stored vertex equal to {@code node}
     * @throws IllegalStateException
     *             if no stored vertex equals {@code node}
     */
    private Node findVertice(Graph<Node, Edge> hypergraph, Node node) {
        Collection<Node> vertices = hypergraph.getVertices();
        for (Node cur : vertices) {
            if (cur.equals(node)) {
                return cur;
            }
        }
        throw new IllegalStateException("node not found: " + node);
    }

    /**
     * Returns the name of the first label child of the given node.
     * 
     * @param fromNode
     *            outline node whose children are inspected
     * @return the label name, or {@code null} if the node has no children or
     *         no label child
     */
    private String getLabelText(OutlineNode fromNode) {
        if (fromNode.getChildren() != null) {
            for (OutlineNode child : fromNode.getChildren()) {
                if (child.getType() == OutlineNode.TYPE_LABEL) {
                    return child.getName();
                }
            }
        }
        return null;
    }

    /**
     * Adds a REFERS_TO edge to {@code to} from the enclosing element of each
     * reference. References whose enclosing element cannot be determined are
     * skipped.
     * 
     * @param hypergraph
     *            graph to update
     * @param references
     *            references that point at the label of {@code to}
     * @param to
     *            vertex that owns the label
     * @param model
     *            parsed document model
     */
    private void addReferenceEdges(Graph<Node, Edge> hypergraph, List<DocumentReference> references, Node to,
            LatexDocumentModel model) {
        for (DocumentReference reference : references) {
            OutlineNode parent = getReferenceParent(reference, model);
            if (parent != null) {
                addInverseEdge(hypergraph, parent, to, EdgeType.REFERS_TO, model);
            }
        }
    }

    /**
     * Finds the innermost outline node whose line range contains the start
     * line of the reference. The search starts at the first top-level node
     * that contains the line and repeatedly descends into the first child
     * that contains it. If no top-level node matches, the document root is
     * returned when its own range contains the line.
     * 
     * @param reference
     *            reference to locate
     * @param model
     *            parsed document model
     * @return the enclosing outline node, or {@code null} if the reference
     *         lies outside of the document root
     */
    private OutlineNode getReferenceParent(DocumentReference reference, LatexDocumentModel model) {
        OutlineNode current = null;
        for (OutlineNode root : model.getTree()) {
            if (containsLine(root, reference)) {
                current = root;
                break;
            }
        }

        while (current != null) {
            OutlineNode next = null;
            List<OutlineNode> children = current.getChildren();
            if (children != null) {
                for (OutlineNode child : children) {
                    if (containsLine(child, reference)) {
                        next = child;
                        break;
                    }
                }
            }
            if (next == null) {
                // No (matching) child: this is the innermost enclosing node.
                return current;
            }
            current = next;
        }

        OutlineNode documentRoot = model.getDocumentRoot();
        if (containsLine(documentRoot, reference)) {
            return documentRoot;
        }
        return null; // then the reference is outer
    }

    /**
     * Collects the references whose key equals the key of the given label.
     * 
     * @param child
     *            outline node of the label
     * @param model
     *            parsed document model
     * @return the matching references; empty if there are none or if the
     *         model holds no label entry for the given outline node
     */
    private List<DocumentReference> getReferencesForLabel(OutlineNode child, LatexDocumentModel model) {
        List<DocumentReference> refs = new ArrayList<DocumentReference>();
        PdfReferenceEntry label = getLabel(child, model);
        if (label == null) {
            // Without this guard a label that is missing from the model's
            // label list would cause a NullPointerException below.
            logger.warn("No label entry found for the label '{}' at line {}; its references are skipped",
                    child.getName(), child.getBeginLine());
            return refs;
        }

        for (Iterator<DocumentReference> iterator = model.getReferences().iterator(); iterator.hasNext();) {
            DocumentReference ref = iterator.next();
            if (ref.getKey().equals(label.key())) {
                refs.add(ref);
            } else if (!refs.isEmpty()) {
                break; // 'cause references list is ordered by 'key'
            }
        }

        return refs;
    }

    /**
     * Looks up the label entry of the model that has the same key and start
     * line as the given label outline node.
     * 
     * @param child
     *            outline node of the label
     * @param model
     *            parsed document model
     * @return the matching label entry, or {@code null} if there is none
     */
    private PdfReferenceEntry getLabel(OutlineNode child, LatexDocumentModel model) {
        List<PdfReferenceEntry> labels = model.getLabels();
        for (PdfReferenceEntry label : labels) {
            if (label.key().equals(child.getName()) && label.startLine() == child.getBeginLine()) {
                return label;
            }
        }
        return null;
    }

    /** Formats the vertex id of an outline node from its begin line and offset. */
    private static String nodeIdOf(OutlineNode outlineNode) {
        return String.format(NODE_ID_FORMAT, outlineNode.getBeginLine(), outlineNode.getOffsetOnLine());
    }

    /** Tells whether the outline node type is a section, subsection or subsubsection. */
    private static boolean isSectioningType(int nodeType) {
        return nodeType == OutlineNode.TYPE_SECTION || nodeType == OutlineNode.TYPE_SUBSECTION
                || nodeType == OutlineNode.TYPE_SUBSUBSECTION;
    }

    /** Tells whether the start line of the reference lies within the line range of the node. */
    private static boolean containsLine(OutlineNode node, DocumentReference reference) {
        return reference.startLine >= node.getBeginLine() && reference.startLine <= node.getEndLine();
    }

    /** Returns the newtheorem command of the model matched by the key predicate, or {@code null}. */
    private NewtheoremCommand findNewtheorem(String key, LatexDocumentModel model) {
        NewtheoremCommand foundCommand = Iterables.find(model.getNewtheorems(),
                new NewtheoremCommand.KeyPredicate(key), null);
        return foundCommand;
    }

    /** Creates a new edge whose context carries the given edge type. */
    private Edge newEdge(EdgeType edgeType) {
        Edge edge = new EdgeImpl();
        EdgeContext context = new EdgeContextImpl(edgeType);
        edge.setContext(context);
        return edge;
    }

    /** Builds a vertex (id, name, position, label text, title, numbering) from an outline node. */
    private Node toGraphNode(OutlineNode outlineNode, LatexDocumentModel model) {
        String labelText = getLabelText(outlineNode);
        String nodeName = extractName(outlineNode, model);
        String nodeTitle = extractTitle(outlineNode, model);
        boolean isNumbered = getNumberedProperty(outlineNode, model);
        return new NodeImpl.Builder(nodeIdOf(outlineNode), nodeName).beginLine(outlineNode.getBeginLine())
                .endLine(outlineNode.getEndLine()).offset(outlineNode.getOffsetOnLine())
                .isEnvironment(outlineNode.getType() == OutlineNode.TYPE_ENVIRONMENT).labelText(labelText)
                .title(nodeTitle).numbered(isNumbered).build();
    }

}