org.metaservice.core.deb.util.GitCache.java Source code

Java tutorial

Introduction

Here is the source code for org.metaservice.core.deb.util.GitCache.java

Source

/*
 * Copyright 2015 Nikola Ilo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.metaservice.core.deb.util;

import org.apache.commons.io.IOUtils;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.metaservice.core.utils.GitUtil;
import org.metaservice.core.utils.MetaserviceHttpClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * Crawls directory listings below a snapshot.debian.org {@code dists/} URL, collects the
 * {@code Packages.gz} / {@code Sources.gz} links found there and replays them in timestamp order
 * into a working directory, committing after each timestamp through {@link GitUtil}.
 *
 * <p>Configuration is held in static fields that {@link #main(String[])} fills in before an
 * instance is created, so only one configuration can be active per JVM.
 */
public class GitCache {
    private static final Logger LOGGER = LoggerFactory.getLogger(GitCache.class);

    /** Host, archive name and timestamp segment of a snapshot URL; mapped onto {@link #workdir}. */
    private static final Pattern SNAPSHOT_PREFIX =
            Pattern.compile("http://snapshot\\.debian\\.org/archive/[^/]*/\\d{8}T\\d{6}Z");

    /** Timestamp path segment of a snapshot URL, e.g. {@code /20050312T000000Z}. */
    private static final Pattern SNAPSHOT_TIMESTAMP = Pattern.compile("/(\\d{8})T(\\d{6})Z");

    /** Suffix removed from the local file name because the content is stored uncompressed. */
    private static final String GZIP_SUFFIX = ".gz";

    /**
     * Timestamps up to and including this value are not replayed.
     * NOTE(review): hard-coded; presumably left over from resuming an interrupted run - confirm.
     */
    private static final long SKIP_UP_TO_TIME = 20110505052259L;

    /**
     * URIs starting with this prefix are not replayed.
     * NOTE(review): hard-coded; reason not visible in this file - confirm before reuse.
     */
    private static final String SKIPPED_SNAPSHOT_PREFIX =
            "http://snapshot.debian.org/archive/debian/20110505T101429Z";

    /** Value the commit counter has to reach before {@code gitUtil.gc()} is triggered. */
    private static final int GC_THRESHOLD = 50;

    // Parameters (set by main before the instance is created)
    private static boolean dryRun = true;
    private static String startString;
    private static File workdir;
    private static String[] archs = { "amd64", "armel", "armhf", "i386", "ia64", "mipsel", "mips", "powerpc",
            "s390x", "s390", "sparc", "alpha", "arm", "AVR32", "hppa", "m32", "m68k", "sh", "ppc64", "x32" };
    private static String[] licenses;

    private GitUtil gitUtil;

    @NotNull
    MetaserviceHttpClient clientMetaservice = new MetaserviceHttpClient();

    /** Running number of index files reported by {@link #runDiscovery()}; used for logging only. */
    int i = 0;

    /** Discovered index file URLs, grouped by the numeric snapshot timestamp contained in the URL. */
    @NotNull
    HashMap<Long, Set<String>> processingMap = new HashMap<>();

    /**
     * Selects one of the built-in configurations and runs discovery followed by the replay.
     *
     * @param args optional single argument: {@code debian} (default when absent), {@code security},
     *             {@code backports}, {@code ports} or {@code volatile}; anything else is logged as
     *             an error and the program returns without doing work
     */
    public static void main(String[] args) throws IOException, InterruptedException, GitUtil.GitException {
        // String.valueOf keeps a null element on the "not found" path instead of failing in the switch.
        String mode = args.length == 0 ? "debian" : String.valueOf(args[0]);
        switch (mode) {
        case "debian":
            configure("http://snapshot.debian.org/archive/debian/20050312T000000Z/dists/",
                    "/opt/crawlertest/data/", "main", "contrib", "non-free");
            break;
        case "security":
            configure("http://snapshot.debian.org/archive/debian-security/20050313T000000Z/dists/",
                    "/opt/crawlertest/data_security/", "updates/main", "updates/contrib", "updates/non-free");
            break;
        case "backports":
            configure("http://snapshot.debian.org/archive/debian-backports/20090114T125110Z/dists/",
                    "/opt/crawlertest/data_backports/", "main", "contrib", "non-free");
            break;
        case "ports":
            configure("http://snapshot.debian.org/archive/debian-ports/20081002T000000Z/dists/",
                    "/opt/crawlertest/data_ports/", "main", "contrib", "non-free");
            break;
        case "volatile":
            configure("http://snapshot.debian.org/archive/debian-volatile/20050313T000000Z/dists/",
                    "/opt/crawlertest/data_volatile/", "volatile/main", "volatile/contrib", "volatile/non-free");
            break;
        default:
            LOGGER.error("NOT FOUND {}", mode);
            return;
        }
        // NOTE(review): the working directory is neither checked nor created here, and the
        // repository is not initialised by this class; presumably both must exist already - confirm.
        GitCache git = new GitCache();
        git.runDiscovery();
        git.createDirectory();
    }

    /** Stores one built-in configuration in the static parameter fields and disables dry-run mode. */
    private static void configure(String start, String directory, String... components) {
        startString = start;
        workdir = new File(directory);
        licenses = components;
        dryRun = false;
    }

    /**
     * Creates the crawler. A {@link GitUtil} for the working directory is only created when
     * dry-run mode is off.
     */
    public GitCache() throws GitUtil.GitException {
        if (!dryRun) {
            gitUtil = new GitUtil(workdir);
        }
    }

    /**
     * Replays the discovered index files in ascending timestamp order. For every timestamp each
     * file is downloaded, written uncompressed below the working directory, and the working copy
     * is committed if it has changes. In dry-run mode the files are only requested from the HTTP
     * client (unless it reports them as cached) and nothing is written or committed.
     */
    private void createDirectory() throws IOException, InterruptedException, GitUtil.GitException {
        int gcCounter = 0;
        for (long time : getAccessOrder()) {
            if (time <= SKIP_UP_TO_TIME) {
                continue;
            }

            LOGGER.info("time: {}", toTimeString(time));
            for (String uri : processingMap.get(time)) {
                if (uri.startsWith(SKIPPED_SNAPSHOT_PREFIX)) {
                    continue;
                }

                File target = getResultPath(uri);
                if (dryRun) {
                    // Dry run: no filesystem changes, only make sure the client has seen the URI.
                    if (clientMetaservice.isCached(uri)) {
                        LOGGER.info("IS IN CACHE {}", uri);
                    } else if (clientMetaservice.getBinary(uri) == null) {
                        LOGGER.error("could not retrieve {} -> SKIPPING", uri);
                        continue;
                    }
                } else {
                    // Download before touching the target so that a failed download leaves the
                    // previous version in place instead of committing a deletion.
                    byte[] data = clientMetaservice.getBinary(uri);
                    if (data == null) {
                        LOGGER.error("could not retrieve {} -> SKIPPING", uri);
                        continue;
                    }
                    writeUncompressed(data, target);
                }

                LOGGER.info("retrieving {} to {}", uri, target);
            }

            if (!dryRun && gitUtil.hasChangesToCommit()) {
                LOGGER.info("workingcopy dirty - will commit");
                gitUtil.commitChanges(toTimeString(time));
                // Post-increment: gc runs on the commit after the counter has reached the threshold.
                if (gcCounter++ == GC_THRESHOLD) {
                    gitUtil.gc();
                    gcCounter = 0;
                }
            } else {
                LOGGER.info("workingcopy clean - will not commit");
            }
        }
    }

    /**
     * Gunzips {@code gzipped} into {@code target}, creating missing parent directories and
     * replacing any existing file. Both streams are closed even when copying fails.
     *
     * @throws IOException if the data is not valid gzip, a parent directory cannot be created or
     *                     the file cannot be written
     */
    private static void writeUncompressed(byte[] gzipped, File target) throws IOException {
        File parent = target.getParentFile();
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("could not create directory " + parent);
        }
        // The gzip header is validated before the output file is opened (and thereby truncated).
        try (InputStream in = new GZIPInputStream(new ByteArrayInputStream(gzipped));
             OutputStream out = new FileOutputStream(target)) {
            IOUtils.copy(in, out);
        }
    }

    /**
     * Maps a snapshot URL onto a file below the working directory: the host/archive/timestamp
     * prefix is replaced by the working directory path and a trailing {@code .gz} is dropped.
     *
     * @param url absolute URL of an index file
     * @return the local file the uncompressed content is stored in
     */
    @NotNull
    private File getResultPath(@NotNull String url) {
        // quoteReplacement: the path may contain '\' or '$', which replaceFirst would interpret.
        String local = SNAPSHOT_PREFIX.matcher(url)
                .replaceFirst(Matcher.quoteReplacement(workdir.getAbsolutePath()));
        if (local.endsWith(GZIP_SUFFIX)) {
            local = local.substring(0, local.length() - GZIP_SUFFIX.length());
        }
        return new File(local);
    }

    /**
     * Crawls the listing pages starting at the configured start URL and registers every
     * {@code Packages.gz} / {@code Sources.gz} link of pages that seem to be non-duplicates.
     * Pages that cannot be loaded are logged and skipped; runtime exceptions are logged and
     * rethrown.
     */
    public void runDiscovery() {
        Set<String> parsed = new HashSet<>();
        // "next change" links are pushed to the front, directories are appended to the back.
        Deque<String> toParse = new LinkedList<>();
        Set<String> dists = new HashSet<>();
        toParse.add(startString);
        while (!toParse.isEmpty()) {
            String uri = toParse.pop();
            try {
                String content = clientMetaservice.get(uri);
                if (content == null) {
                    LOGGER.error("Couldn't load {} skipping.", uri);
                    continue;
                }
                Document document = Jsoup.parse(content, uri);
                parsed.add(uri);
                enqueueNextChanges(document, parsed, toParse);
                enqueueDirectories(uri, document, dists, toParse);
                registerIndexFiles(uri, document);
            } catch (RuntimeException exception) {
                LOGGER.error("RUNTIME EXCEPTION ", exception);
                throw exception;
            }
        }
    }

    /** Pushes every "next change" link that is neither parsed nor already queued to the front of the queue. */
    private static void enqueueNextChanges(Document document, Set<String> parsed, Deque<String> toParse) {
        for (Element link : document.select("a:contains(next change)")) {
            String href = link.attr("abs:href");
            if (!parsed.contains(href) && !toParse.contains(href)) {
                LOGGER.info("adding (next) {}", href);
                toParse.push(href);
            }
        }
    }

    /**
     * Queues sub-directories of the current page: on a {@code dists/} page the configured
     * component directories of every newly seen entry, on a component page the {@code binary-*}
     * directories of known architectures and the {@code source*} directories.
     */
    private static void enqueueDirectories(String uri, Document document, Set<String> dists,
            Deque<String> toParse) {
        for (Element link : document.select("a[href$=/]")) {
            String absHref = link.attr("abs:href");
            String href = link.attr("href");
            if (dists.contains(href) || href.startsWith("/") || href.startsWith(".")) {
                continue;
            }
            if (uri.endsWith("dists/")) {
                dists.add(href);
                LOGGER.info(href);
                for (String license : licenses) {
                    String url = absHref + license + "/";
                    LOGGER.info("adding (lic) {}", url);
                    toParse.add(url);
                }
            }
            for (String license : licenses) {
                if (!uri.endsWith(license + "/")) {
                    continue;
                }
                if (href.startsWith("binary-") && containsKnownArch(href)) {
                    LOGGER.info("adding (archdir) {}", absHref);
                    toParse.add(absHref);
                }
                if (href.startsWith("source")) {
                    LOGGER.info("adding (archdir) {}", absHref);
                    toParse.add(absHref);
                }
            }
        }
    }

    /**
     * Tells whether {@code href} contains one of the configured architecture names. Stops at the
     * first hit so that names which contain each other (e.g. "armel" and "arm") count once.
     */
    private static boolean containsKnownArch(String href) {
        for (String arch : archs) {
            if (href.contains(arch)) {
                return true;
            }
        }
        return false;
    }

    /** Registers the index file links of the page if the page seems to be a non-duplicate. */
    private void registerIndexFiles(String uri, Document document) {
        List<Element> indexLinks = document.select("a[abs:href$=Packages.gz] , a[abs:href$=Sources.gz]");
        // The duplicate check depends on the page only, so it is evaluated once and not per link.
        if (indexLinks.isEmpty() || !seemsNonDuplicate(uri, document)) {
            return;
        }
        for (Element link : indexLinks) {
            String href = link.attr("abs:href");
            LOGGER.info("RESULT processing ... {} {} ", i++, href);
            processFileToParse(href);
        }
    }

    /**
     * A page counts as non-duplicate when it has no "prev change" link, or when its first
     * "prev change" link and its first plain "prev" link point to the same URL. If a
     * "prev change" link exists but no plain "prev" link does, the page is processed as well,
     * because processing a duplicate is harmless while skipping a page could lose data.
     */
    private static boolean seemsNonDuplicate(String uri, Document document) {
        List<Element> prevChange = document.select("a:contains(prev change)");
        if (prevChange.isEmpty()) {
            return true;
        }
        List<Element> prev = document.select("a:contains(prev):not(:contains(change))");
        if (prev.isEmpty()) {
            LOGGER.warn("no plain prev link on {} - processing anyway", uri);
            return true;
        }
        return prevChange.get(0).attr("abs:href").equals(prev.get(0).attr("abs:href"));
    }

    /** Files {@code href} under the timestamp contained in it. */
    private void processFileToParse(@NotNull String href) {
        long timeConstant = getTimeConstant(href);
        Set<String> uris = processingMap.get(timeConstant);
        if (uris == null) {
            uris = new HashSet<>();
            processingMap.put(timeConstant, uris);
        }
        uris.add(href);
    }

    /** @return the timestamps that have registered files, in ascending order */
    @NotNull
    private List<Long> getAccessOrder() {
        List<Long> ordered = new ArrayList<>(processingMap.keySet());
        Collections.sort(ordered);
        return ordered;
    }

    /**
     * Extracts the first {@code /yyyyMMddTHHmmssZ} path segment of {@code href} as the number
     * {@code yyyyMMddHHmmss}.
     *
     * @throws NumberFormatException if {@code href} contains no such segment
     */
    private long getTimeConstant(@NotNull String href) {
        Matcher timestamp = SNAPSHOT_TIMESTAMP.matcher(href);
        if (!timestamp.find()) {
            throw new NumberFormatException("no snapshot timestamp in " + href);
        }
        return Long.parseLong(timestamp.group(1) + timestamp.group(2));
    }

    /** Formats a numeric timestamp as its first eight digits, a {@code T}, and the remaining digits. */
    private String toTimeString(long time) {
        String digits = Long.toString(time);
        return digits.substring(0, 8) + "T" + digits.substring(8);
    }
}