FeatureExtraction.FeatureExtractorOOXMLStructuralPathsInputStream.java Source code

Java tutorial

Introduction

Here is the source code for FeatureExtraction.FeatureExtractorOOXMLStructuralPathsInputStream.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package FeatureExtraction;

import IO.Console;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 *
 * @author Aviad
 */
public class FeatureExtractorOOXMLStructuralPathsInputStream<T> extends AFeatureExtractor<T> {

    private final long serialVersionUID = 1L;
    private final boolean m_ignoreNumbersInFeatures;

    public FeatureExtractorOOXMLStructuralPathsInputStream(boolean ignoreNumbersInFeatures) {
        m_ignoreNumbersInFeatures = ignoreNumbersInFeatures;
    }

    @Override
    public Map<String, Integer> ExtractFeaturesFrequencyFromSingleElement(T element) {
        Map<String, Integer> structuralPaths = new HashMap<>();
        InputStream fileInputStream = (InputStream) element;
        ExtractStructuralFeaturesInMemory(fileInputStream, structuralPaths);
        return structuralPaths;
    }

    private void ExtractStructuralFeaturesInMemory(InputStream fileInputStream,
            Map<String, Integer> structuralPaths) {
        String path;
        String directoryPath;
        String fileExtension;
        try {
            File file = new File(fileInputStream.hashCode() + "");
            try (FileOutputStream fos = new FileOutputStream(file)) {
                IOUtils.copy(fileInputStream, fos);
            }

            ZipFile zipFile = new ZipFile(file);

            for (Enumeration e = zipFile.entries(); e.hasMoreElements();) {
                ZipEntry entry = (ZipEntry) e.nextElement();
                path = entry.getName();
                directoryPath = FilenameUtils.getFullPath(path);
                fileExtension = FilenameUtils.getExtension(path);

                AddStructuralPath(directoryPath, structuralPaths);
                AddStructuralPath(path, structuralPaths);

                if (fileExtension.equals("rels") || fileExtension.equals("xml")) {
                    AddXMLStructuralPaths(zipFile.getInputStream(entry), path, structuralPaths);
                }
            }
        } catch (IOException ex) {
            Console.PrintException(String.format("Error extracting OOXML structural features from file in memory"),
                    ex);
        }
    }

    /**
     * Add structural paths from the given xml file into the local map
     *
     * @param xmlFilePath the path of a xml file
     * @param structuralPaths the Map to add the extracted features to
     */
    private void AddXMLStructuralPaths(InputStream xmlFilePath, String path, Map<String, Integer> structuralPaths) {
        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document xml = db.parse(xmlFilePath);
            NodeList nodeList = xml.getChildNodes();
            for (int i = 0; i < nodeList.getLength(); i++) {
                AddXMLStructuralPathsRecursively(nodeList.item(i), path, structuralPaths);
            }
        } catch (Exception ex) {
            //Console.Print_To_Console(String.format("Error traversing XML file: '%s'", xmlFilePath), true, false);
        }
    }

    /**
     * Add structural paths from the given xml file into the local map
     * recursively
     *
     * @param xmlNode xmlNode to look for its childs
     * @param parentNodePath the path of the parent node
     * @param structuralPaths the Map to add the extracted features to
     */
    private void AddXMLStructuralPathsRecursively(Node xmlNode, String parentNodePath,
            Map<String, Integer> structuralPaths) {
        String currentNodePath = String.format("%s\\%s", parentNodePath, xmlNode.getNodeName());
        AddStructuralPath(currentNodePath, structuralPaths);

        NodeList childNodes = xmlNode.getChildNodes();
        Node childNode;
        for (int i = 0; i < childNodes.getLength(); i++) {
            childNode = childNodes.item(i);
            AddXMLStructuralPathsRecursively(childNode, currentNodePath, structuralPaths);
        }
    }

    /**
     * Add structural path to local Map
     *
     * @param structuralPath the key to add to the map
     * @param structuralPaths the Map to add the feature to
     */
    private void AddStructuralPath(String structuralPath, Map<String, Integer> structuralPaths) {
        structuralPath = structuralPath.replace("/", "\\");
        if (m_ignoreNumbersInFeatures) {
            structuralPath = structuralPath.replaceAll("[0-9]", "");
        }

        if (!structuralPath.equals("")) {
            if (!structuralPaths.containsKey(structuralPath)) {
                structuralPaths.put(structuralPath, 1);
            } else {
                structuralPaths.put(structuralPath, structuralPaths.get(structuralPath) + 1);
            }
        }
    }

    @Override
    public String GetName() {
        if (m_ignoreNumbersInFeatures) {
            return "OOXML Structural Paths NN";
        } else {
            return "OOXML Structural Paths";
        }
    }

}