be.fedict.dcat.scrapers.XlsPsiBelgium.java Source code

Java tutorial

Introduction

Here is the source code for be.fedict.dcat.scrapers.XlsPsiBelgium.java

Source

/*
 * Copyright (c) 2015, Bart Hanssens <bart.hanssens@fedict.be>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package be.fedict.dcat.scrapers;

import be.fedict.dcat.helpers.Storage;
import be.fedict.dcat.vocab.DCAT;
import be.fedict.dcat.vocab.MDR_LANG;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import org.apache.poi.ss.usermodel.Row;
import org.openrdf.model.IRI;
import org.openrdf.model.vocabulary.DCTERMS;
import org.openrdf.model.vocabulary.FOAF;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.repository.RepositoryException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scraper for PSI Belgium Excel export.
 * 
 * @author Bart Hanssens <bart.hanssens@fedict.be>
 */
public class XlsPsiBelgium extends Xls {
    private final Logger logger = LoggerFactory.getLogger(XlsPsiBelgium.class);

    public final static String ID = "contentid";

    public final static String ACCESS = "questionformnameurl_url";
    public final static String ACCESS2 = "publicinfo_siteurl";
    public final static String CREATED = "publishstartdate";
    public final static String DESC = "formshortdsc_dsc";
    public final static String DESC2 = "formcomment_comment";
    public final static String DESC3 = "datescomment_comment";
    public final static String DESC4 = "licensecomment_comment";
    public final static String DESC5 = "reusablebylicence_reglobasisname";
    public final static String DOWNLOAD = "forminformationurl_url";
    public final static String FEE = "feerequired";
    public final static String FMT1 = "efomrat_imageformatlabel";
    public final static String FMT2 = "eformat_vectorelformatlabel";
    public final static String FMT3 = "eformat_pageformatlabel";
    public final static String FREQID = "idfrequencytype_fk";
    public final static String KEYWORD = "searchoninfo_";
    public final static String ORGID = "idinstitudiont_fk";
    public final static String PFROM = "dispdate";
    public final static String PTILL = "lastrevdate";
    public final static String REUSE = "reusablebylicence";
    public final static String RIGHTS = "obtentionurl_url";
    public final static String TITLE = "formtitle";

    public final static DateFormat DATEFMT = new SimpleDateFormat("yyyy-MM-dd");

    @Override
    protected URL getId(Row row) throws MalformedURLException {
        String s = row.getCell(0).toString();
        return makeDatasetURL(stringInt(s));
    }

    /**
     * Get date from field
     * 
     * @param map
     * @param field 
     * @return date or null
     */
    protected Date getDate(Map<String, String> map, String field) {
        Date date = null;
        String s = map.getOrDefault(field, "");
        if (!s.isEmpty()) {
            try {
                date = XlsPsiBelgium.DATEFMT.parse(s);
            } catch (ParseException ex) {
                logger.warn("Could not parse {} to date", s);
            }
        }
        return date;
    }

    /**
     * Get array of keywords, based on last part of search field.
     * 
     * @param map
     * @param lang language code
     * @return 
     */
    protected String[] getKeywords(Map<String, String> map, String lang) {
        String s = map.getOrDefault(XlsPsiBelgium.KEYWORD + lang, "");
        // Keywords are added after the last sentence of the description
        int pos = s.lastIndexOf('.');
        if (pos > 0) {
            s = s.substring(pos + 1);
            return s.replaceAll("\n", "").split(",");
        }
        return (new String[0]);
    }

    /**
     * Get array of formats
     * 
     * @param map
     * @return string array of formats
     */
    private String[] getFormats(Map<String, String> map) {
        String fmt1 = map.getOrDefault(XlsPsiBelgium.FMT1, "");
        String fmt2 = map.getOrDefault(XlsPsiBelgium.FMT2, "");
        String fmt3 = map.getOrDefault(XlsPsiBelgium.FMT3, "");

        String fmt = fmt1 + "," + fmt2 + "," + fmt3;
        return fmt.toLowerCase().replace(";", ",").replace(".", "").split(",");
    }

    /**
     * Add organization / publisher
     * 
     * @param store
     * @param dataset
     * @param s
     * @throws MalformedURLException
     * @throws RepositoryException 
     */
    private void generateOrg(Storage store, IRI dataset, Map<String, String> map)
            throws MalformedURLException, RepositoryException {
        String s = map.getOrDefault(XlsPsiBelgium.ORGID, "");
        IRI org = store.getURI(makeOrgURL(stringInt(s)).toString());
        store.add(dataset, DCTERMS.PUBLISHER, org);
        store.add(org, RDF.TYPE, FOAF.ORGANIZATION);
    }

    /**
     * Generate DCAT Distribution.
     * 
     * @param store
     * @param dataset
     * @param map
     * @param lang
     * @throws RepositoryException 
     */
    private void generateDist(Storage store, IRI dataset, Map<String, String> map, String id, String lang)
            throws RepositoryException, MalformedURLException {
        URL u = makeDistURL(id + "/" + lang);
        IRI dist = store.getURI(u.toString());
        logger.debug("Generating distribution {}", dist.toString());

        String access = map.getOrDefault(XlsPsiBelgium.ACCESS + lang, "");
        String access2 = map.getOrDefault(XlsPsiBelgium.ACCESS2 + lang, "");

        String rights = map.getOrDefault(XlsPsiBelgium.RIGHTS + lang, "");
        String download = map.getOrDefault(XlsPsiBelgium.DOWNLOAD + lang, "");

        String[] fmts = getFormats(map);
        for (String fmt : fmts) {
            if (!fmt.trim().equals("")) {
                store.add(dist, DCAT.MEDIA_TYPE, fmt.trim());
            }
        }

        String fee = stringInt(map.get(XlsPsiBelgium.FEE));
        String reuse = stringInt(map.get(XlsPsiBelgium.REUSE));
        boolean open = reuse.equals("1") && fee.equals("0");

        store.add(dataset, DCAT.DISTRIBUTION, dist);
        store.add(dist, RDF.TYPE, DCAT.A_DISTRIBUTION);
        store.add(dist, DCTERMS.LANGUAGE, MDR_LANG.MAP.get(lang));
        if (!access.isEmpty()) {
            store.add(dist, DCAT.ACCESS_URL, new URL(access));
        }
        if (!access2.isEmpty()) {
            store.add(dist, DCAT.ACCESS_URL, new URL(access2));
        }
        if (!download.isEmpty()) {
            store.add(dist, DCAT.DOWNLOAD_URL, new URL(download));
            store.add(dist, DCAT.MEDIA_TYPE, getFileExt(download));
        }
        store.add(dist, DCTERMS.LICENSE, open ? "open" : "closed");
        if (!rights.isEmpty()) {
            store.add(dist, DCTERMS.RIGHTS, new URL(rights));
        }
    }

    /**
     * Generate DCAT Dataset.
     * 
     * @param store
     * @param map 
     * @param u
     * @throws RepositoryException
     * @throws MalformedURLException
     */
    @Override
    public void generateDataset(Storage store, Map<String, String> map, URL u)
            throws RepositoryException, MalformedURLException {
        IRI dataset = store.getURI(u.toString());
        logger.debug("Generating dataset {}", dataset.toString());

        String id = stringInt(map.get(XlsPsiBelgium.ID));
        String freq = stringInt(map.getOrDefault(XlsPsiBelgium.FREQID, "0"));

        store.add(dataset, RDF.TYPE, DCAT.A_DATASET);
        store.add(dataset, DCTERMS.IDENTIFIER, makeHashId(u.toString()));

        Date created = getDate(map, XlsPsiBelgium.CREATED);
        if (created != null) {
            store.add(dataset, DCTERMS.CREATED, created);
        }

        store.add(dataset, DCTERMS.ACCRUAL_PERIODICITY, freq);

        String[] langs = getAllLangs();
        for (String lang : langs) {
            String title = map.getOrDefault(XlsPsiBelgium.TITLE + lang, "");

            String desc = map.getOrDefault(XlsPsiBelgium.DESC + lang, title);
            String desc2 = map.getOrDefault(XlsPsiBelgium.DESC2 + lang, "");
            if (!desc2.isEmpty()) {
                desc = desc + "\n\n" + desc2;
            }
            String desc3 = map.getOrDefault(XlsPsiBelgium.DESC3 + lang, "");
            if (!desc3.isEmpty()) {
                desc = desc + "\n\n" + desc3;
            }
            String desc4 = map.getOrDefault(XlsPsiBelgium.DESC4 + lang, "");
            if (!desc4.isEmpty()) {
                desc = desc + "\n\n" + desc4;
            }
            String desc5 = map.getOrDefault(XlsPsiBelgium.DESC5 + lang, "");
            if (!desc5.isEmpty()) {
                desc = desc + "\n\n" + desc5;
            }

            store.add(dataset, DCTERMS.LANGUAGE, MDR_LANG.MAP.get(lang));
            store.add(dataset, DCTERMS.TITLE, title, lang);
            store.add(dataset, DCTERMS.DESCRIPTION, desc, lang);

            String from = stringInt(map.getOrDefault(XlsPsiBelgium.PFROM, ""));
            String till = stringInt(map.getOrDefault(XlsPsiBelgium.PTILL, ""));
            String period = from + " / " + till;
            if (!period.equals(" / ")) {
                store.add(dataset, DCTERMS.TEMPORAL, period);
            }

            String[] words = getKeywords(map, lang);
            for (String word : words) {
                store.add(dataset, DCAT.KEYWORD, word.trim(), lang);
            }

            generateDist(store, dataset, map, id, lang);
        }
        generateOrg(store, dataset, map);
    }

    /**
     * Constructor.
     * 
     * @param caching
     * @param storage
     * @param base 
     */
    public XlsPsiBelgium(File caching, File storage, URL base) {
        super(caching, storage, base);
        setName("psibelgium");
    }
}