de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetLoader.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetLoader.java

Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.api.datasets;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.ud.UDDataset;

/**
 * Downloads dataset packages into a local cache directory, verifies them against their declared
 * MD5/SHA1 checksums and runs an optional post-fetch action (such as unpacking an archive).
 *
 * <p>
 * The cache root is plain mutable state set through the constructor or
 * {@link #setCacheRoot(File)}; nothing in this class synchronizes access to it.
 * </p>
 *
 * @deprecated This class is flagged as deprecated in the source. NOTE(review): the intended
 *             replacement is not visible from this file - please add a pointer to it here.
 */
@Deprecated
public class DatasetLoader {
    private final Log LOG = LogFactory.getLog(getClass());

    private File cacheRoot;

    /**
     * Creates a loader without a cache root. Use {@link #setCacheRoot(File)} to configure one.
     */
    public DatasetLoader() {
    }

    /**
     * Creates a loader which stores downloaded data below the given directory.
     *
     * @param aCacheRoot
     *            the directory below which datasets are cached.
     */
    public DatasetLoader(File aCacheRoot) {
        setCacheRoot(aCacheRoot);
    }

    /**
     * Sets the directory below which datasets are cached.
     *
     * @param aCacheRoot
     *            the cache root directory.
     */
    public void setCacheRoot(File aCacheRoot) {
        cacheRoot = aCacheRoot;
    }

    /**
     * @return the directory below which datasets are cached (may be {@code null} if it has not
     *         been set).
     */
    public File getCacheRoot() {
        return cacheRoot;
    }

    /**
     * Fetches the Universal Dependencies treebank collection v1.3 into the
     * {@code ud-treebanks-v1.3} folder below the cache root, unpacks it and wraps every entry of
     * the unpacked {@code ud-treebanks-v1.3} folder in a {@link UDDataset}.
     *
     * @return one dataset per entry found in the unpacked treebank folder.
     * @throws IOException
     *             if the package cannot be downloaded, fails checksum verification, cannot be
     *             unpacked, or if the unpacked treebank folder cannot be listed.
     */
    public List<Dataset> loadUniversalDependencyTreebankV1_3() throws IOException {
        File dataDir = new File(cacheRoot, "ud-treebanks-v1.3");

        DataPackage data = new DataPackage.Builder()
                .url("https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/"
                        + "1-1699/ud-treebanks-v1.3.tgz?sequence=1&isAllowed=y")
                .sha1("44367112880cf0af3f293cb3f0cc6ce50c0e65c0").target("ud-treebanks-v1.3.tgz")
                .postAction((d) -> {
                    untgz(new File(dataDir, d.getTarget()), dataDir);
                }).build();

        fetch(dataDir, data);

        // File.listFiles() returns null (instead of throwing) if the path is not a directory or
        // cannot be read - report that explicitly instead of failing with an NPE in the loop.
        File treebankRoot = new File(dataDir, "ud-treebanks-v1.3");
        File[] treebanks = treebankRoot.listFiles();
        if (treebanks == null) {
            throw new IOException("Unable to list treebanks in [" + treebankRoot + "]");
        }

        List<Dataset> sets = new ArrayList<>(treebanks.length);
        for (File f : treebanks) {
            sets.add(new UDDataset(f));
        }
        return sets;
    }

    /**
     * Makes sure that all given packages are available in the target folder. The method works in
     * three phases: (1) already cached files are validated against the declared checksums and
     * the whole target folder is deleted if any of them does not match, (2) missing files are
     * downloaded and verified, (3) post-fetch actions are run for packages which do not yet have
     * a {@code .postComplete} marker file next to the cached file.
     *
     * @param aTarget
     *            the folder in which the packages are cached.
     * @param aPackages
     *            the packages to fetch.
     * @throws IOException
     *             if a download, a checksum verification or a post-fetch action fails.
     * @throws IllegalStateException
     *             if a post-fetch action fails with an exception other than {@link IOException}.
     */
    private void fetch(File aTarget, DataPackage... aPackages) throws IOException {
        // First validate if local copies are still up-to-date
        boolean reload = false;
        for (DataPackage pack : aPackages) {
            File cachedFile = new File(aTarget, pack.getTarget());
            if (!cachedFile.exists()) {
                continue;
            }

            // SHA1 is checked first; the short-circuit skips the MD5 check after a SHA1 mismatch
            if (!isLocalCopyValid(cachedFile, "SHA1", pack.getSha1())
                    || !isLocalCopyValid(cachedFile, "MD5", pack.getMd5())) {
                reload = true;
                break;
            }
        }

        // If any of the packages are outdated, clear the cache and download again
        if (reload) {
            LOG.info("Clearing local cache for [" + aTarget + "]");
            FileUtils.deleteQuietly(aTarget);
        }

        for (DataPackage pack : aPackages) {
            File cachedFile = new File(aTarget, pack.getTarget());
            if (!cachedFile.exists()) {
                download(pack, cachedFile);
            }
        }

        // Perform a post-fetch action such as unpacking
        for (DataPackage pack : aPackages) {
            File cachedFile = new File(aTarget, pack.getTarget());
            File postActionCompleteMarker = new File(cachedFile.getPath() + ".postComplete");
            if (pack.getPostAction() != null && !postActionCompleteMarker.exists()) {
                try {
                    pack.getPostAction().run(pack);
                    // The marker is only written after the action completed without exception
                    FileUtils.touch(postActionCompleteMarker);
                } catch (IOException e) {
                    throw e;
                } catch (Exception e) {
                    throw new IllegalStateException(e);
                }
            }
        }
    }

    /**
     * Checks an already cached file against an expected checksum and logs the outcome.
     *
     * @param aCachedFile
     *            the cached file.
     * @param aAlgorithm
     *            the digest algorithm name, also used in the log message (e.g. {@code SHA1}).
     * @param aExpected
     *            the expected hex-encoded checksum or {@code null} if none was declared.
     * @return {@code false} if a checksum was declared and does not match, {@code true}
     *         otherwise.
     * @throws IOException
     *             if the file cannot be read or the algorithm is not available.
     */
    private boolean isLocalCopyValid(File aCachedFile, String aAlgorithm, String aExpected)
            throws IOException {
        if (aExpected == null) {
            return true;
        }

        String actual = getDigest(aCachedFile, aAlgorithm);
        // Hex digests are case-insensitive; the locally computed one is lower-case
        if (!aExpected.equalsIgnoreCase(actual)) {
            LOG.info("Local " + aAlgorithm + " hash mismatch on [" + aCachedFile
                    + "] - expected [" + aExpected + "] - actual [" + actual + "]");
            return false;
        }

        LOG.info("Local " + aAlgorithm + " hash verified on [" + aCachedFile + "] - [" + actual
                + "]");
        return true;
    }

    /**
     * Downloads a package to the given file while computing its MD5 and SHA1 digests on the fly
     * and verifies the digests which the package declares. If the transfer or the verification
     * fails, the (partial or corrupt) file is removed again so it cannot be mistaken for a valid
     * cache entry later on.
     *
     * @param aPackage
     *            the package to download.
     * @param aCachedFile
     *            the file to store the download in.
     * @throws IOException
     *             if the transfer fails or a declared checksum does not match.
     */
    private void download(DataPackage aPackage, File aCachedFile) throws IOException {
        MessageDigest md5 = newDigest("MD5");
        MessageDigest sha1 = newDigest("SHA1");

        File parent = aCachedFile.getParentFile();
        if (parent != null) {
            FileUtils.forceMkdir(parent);
        }
        URL source = new URL(aPackage.getUrl());

        LOG.info("Fetching [" + aCachedFile + "]");

        URLConnection connection = source.openConnection();
        connection.setRequestProperty("User-Agent", "Java");

        boolean verified = false;
        try (InputStream is = connection.getInputStream()) {
            // Chain both digest filters so a single pass over the data feeds both digests
            DigestInputStream md5Filter = new DigestInputStream(is, md5);
            DigestInputStream sha1Filter = new DigestInputStream(md5Filter, sha1);
            FileUtils.copyInputStreamToFile(sha1Filter, aCachedFile);

            verifyDownload("MD5", aPackage.getMd5(), md5Filter.getMessageDigest());
            verifyDownload("SHA1", aPackage.getSha1(), sha1Filter.getMessageDigest());
            verified = true;
        } finally {
            if (!verified) {
                FileUtils.deleteQuietly(aCachedFile);
            }
        }
    }

    /**
     * Compares the digest of freshly downloaded data with the expected checksum.
     *
     * @param aAlgorithm
     *            the algorithm name used in the error message.
     * @param aExpected
     *            the expected hex-encoded checksum or {@code null} to skip the check.
     * @param aDigest
     *            the digest which has been fed with the downloaded data.
     * @throws IOException
     *             if a checksum was declared and does not match.
     */
    private void verifyDownload(String aAlgorithm, String aExpected, MessageDigest aDigest)
            throws IOException {
        if (aExpected == null) {
            return;
        }

        String actual = toHex(aDigest);
        if (!aExpected.equalsIgnoreCase(actual)) {
            String message = aAlgorithm + " mismatch. Expected [" + aExpected + "] but got ["
                    + actual + "].";
            LOG.error(message);
            throw new IOException(message);
        }
    }

    /**
     * Creates a message digest, reporting an unavailable algorithm as {@link IOException}.
     */
    private static MessageDigest newDigest(String aAlgorithm) throws IOException {
        try {
            return MessageDigest.getInstance(aAlgorithm);
        } catch (NoSuchAlgorithmException e) {
            throw new IOException(e);
        }
    }

    /**
     * Completes the digest computation and returns the result hex-encoded.
     */
    private static String toHex(MessageDigest aDigest) {
        return new String(Hex.encodeHex(aDigest.digest()));
    }

    /**
     * Computes the digest of a file.
     *
     * @param aFile
     *            the file to read.
     * @param aDigest
     *            the digest algorithm name, e.g. {@code MD5} or {@code SHA1}.
     * @return the hex-encoded digest.
     * @throws IOException
     *             if the file cannot be read or the algorithm is not available.
     */
    private String getDigest(File aFile, String aDigest) throws IOException {
        MessageDigest digest = newDigest(aDigest);
        try (InputStream is = new FileInputStream(aFile)) {
            DigestInputStream digestFilter = new DigestInputStream(is, digest);
            // The data itself is not needed - reading it through the filter updates the digest
            IOUtils.copy(digestFilter, new NullOutputStream());
            return toHex(digestFilter.getMessageDigest());
        }
    }

    /**
     * Unpacks a gzip-compressed tar archive.
     *
     * @param aArchive
     *            the archive file.
     * @param aTarget
     *            the folder to extract into.
     * @throws IOException
     *             if the archive cannot be read or an entry cannot be written.
     */
    private void untgz(File aArchive, File aTarget) throws IOException {
        try (ArchiveInputStream archive = new TarArchiveInputStream(
                new GzipCompressorInputStream(new BufferedInputStream(new FileInputStream(aArchive))))) {
            extract(aArchive, archive, aTarget);
        }
    }

    /**
     * Writes all entries of an archive stream below the target folder.
     *
     * @param aArchive
     *            the archive file the stream was opened from (used for error messages only).
     * @param aArchiveStream
     *            the archive stream to read entries from. It is not closed by this method.
     * @param aTarget
     *            the folder to extract into.
     * @throws IOException
     *             if an entry would be written outside of the target folder or cannot be written.
     * @throws IllegalStateException
     *             if an entry name contains a line break.
     */
    private void extract(File aArchive, ArchiveInputStream aArchiveStream, File aTarget) throws IOException {
        File targetRoot = aTarget.getCanonicalFile();

        ArchiveEntry entry = null;
        while ((entry = aArchiveStream.getNextEntry()) != null) {
            String name = entry.getName();

            // Ensure that the filename will not break the manifest
            if (name.contains("\n")) {
                throw new IllegalStateException("Filename must not contain line break");
            }

            File out = new File(aTarget, name);

            // Guard against path traversal: entry names such as "../../x" must not be able to
            // escape the extraction folder. Path.startsWith compares whole path elements.
            if (!out.getCanonicalFile().toPath().startsWith(targetRoot.toPath())) {
                throw new IOException("Entry [" + name + "] of archive [" + aArchive
                        + "] would be extracted outside of [" + aTarget + "]");
            }

            if (entry.isDirectory()) {
                FileUtils.forceMkdir(out);
            } else {
                // Shield the shared archive stream so that copying one entry cannot close it
                FileUtils.copyInputStreamToFile(new CloseShieldInputStream(aArchiveStream), out);
            }
        }
    }
}