sk.opendata.odn.harvester.datanest.OrganizationsDatanestHarvester.java Source code

Java tutorial

Introduction

Here is the source code for sk.opendata.odn.harvester.datanest.OrganizationsDatanestHarvester.java

Source

/* Copyright (C) 2011 Peter Hanecak <hanecak@opendata.sk>
 *
 * This file is part of Open Data Node.
 *
 * Open Data Node is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Open Data Node is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Open Data Node.  If not, see <http://www.gnu.org/licenses/>.
 */

package sk.opendata.odn.harvester.datanest;

import java.io.IOException;
import java.text.ParseException;
import java.util.Date;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;

import org.apache.commons.lang3.StringEscapeUtils;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.config.RepositoryConfigException;
import org.quartz.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import sk.opendata.odn.model.OrganizationRecord;
import sk.opendata.odn.repository.sesame.SesameRepository;
import sk.opendata.odn.repository.solr.SolrRepository;
import sk.opendata.odn.serialization.rdf.OrganizationRdfSerializer;
import sk.opendata.odn.serialization.solr.SolrSerializer;

/**
 * Harvester for the "Register Organizacii" dataset published by Aliancia
 * Fair-Play at http://datanest.fair-play.sk/datasets/1 .
 *
 * <p>Each row of the dataset is converted into an {@link OrganizationRecord}
 * by {@link #scrapOneRecord(String[])}. The serializers registered in the
 * constructor (RDF and Solr) are handed over to the parent class via
 * {@code addSerializer}.
 */
public class OrganizationsDatanestHarvester extends AbstractDatanestHarvester<OrganizationRecord> implements Job {

    /** Configuration key passed to the parent harvester to locate the dataset URL. */
    public static final String KEY_DATANEST_ORGANIZATIONS_URL_KEY = "datanest.organizations.url";

    // Zero-based positions of the attributes inside one row of the dataset.
    protected static final int ATTR_INDEX_ID = 0;
    protected static final int ATTR_INDEX_NAME = 2;
    protected static final int ATTR_INDEX_SEAT = 4;
    protected static final int ATTR_INDEX_LEGAL_FORM = 5;
    protected static final int ATTR_INDEX_ICO = 3;
    protected static final int ATTR_INDEX_DATE_FROM = 7;
    protected static final int ATTR_INDEX_DATE_TO = 8;
    protected static final int ATTR_INDEX_SOURCE = 14;

    private static final Logger logger = LoggerFactory.getLogger(OrganizationsDatanestHarvester.class);

    /**
     * Creates the harvester and registers its serializers: an
     * {@link OrganizationRdfSerializer} backed by the {@link SesameRepository}
     * instance and a {@link SolrSerializer} backed by the
     * {@link SolrRepository} instance.
     *
     * @throws IOException
     *             declared by the parent constructor or by one of the
     *             repository/serializer initializations
     * @throws RepositoryConfigException
     *             declared by the parent constructor or by one of the
     *             repository/serializer initializations
     * @throws RepositoryException
     *             declared by the parent constructor or by one of the
     *             repository/serializer initializations
     * @throws ParserConfigurationException
     *             declared by the parent constructor or by one of the
     *             repository/serializer initializations
     * @throws TransformerConfigurationException
     *             declared by the parent constructor or by one of the
     *             repository/serializer initializations
     */
    public OrganizationsDatanestHarvester() throws IOException, RepositoryConfigException, RepositoryException,
            ParserConfigurationException, TransformerConfigurationException {

        super(KEY_DATANEST_ORGANIZATIONS_URL_KEY);

        OrganizationRdfSerializer rdfSerializer = new OrganizationRdfSerializer(SesameRepository.getInstance());
        addSerializer(rdfSerializer);

        SolrSerializer<OrganizationRecord> solrSerializer = new SolrSerializer<OrganizationRecord>(
                SolrRepository.getInstance());
        addSerializer(solrSerializer);
    }

    /**
     * Converts one row of the dataset into an {@link OrganizationRecord}.
     *
     * <p>The row is addressed through the {@code ATTR_INDEX_*} constants, so
     * it has to contain at least {@code ATTR_INDEX_SOURCE + 1} columns;
     * a shorter row results in an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param row
     *            column values of a single dataset row
     * @return record filled with the values taken from {@code row}
     * @throws ParseException
     *             when the "date from" column, or a non-empty "date to"
     *             column, cannot be parsed by {@code sdf}
     */
    @Override
    public OrganizationRecord scrapOneRecord(String[] row) throws ParseException {
        OrganizationRecord record = new OrganizationRecord();

        // The record ID is the Datanest ID with an "org_" prefix; the raw
        // Datanest ID is stored separately as well.
        record.setId("org_" + row[ATTR_INDEX_ID]);
        record.setDatanestId(row[ATTR_INDEX_ID]);
        record.setSource(row[ATTR_INDEX_SOURCE]);
        // The name is the only attribute stored XML-escaped.
        record.setName(StringEscapeUtils.escapeXml(row[ATTR_INDEX_NAME]));
        record.setLegalForm(row[ATTR_INDEX_LEGAL_FORM]);
        record.setSeat(row[ATTR_INDEX_SEAT]);
        record.setIco(row[ATTR_INDEX_ICO]);

        // "date from" is always parsed; 'sdf' is not declared in this class
        // (presumably a date format inherited from the parent harvester).
        Date dateFrom = sdf.parse(row[ATTR_INDEX_DATE_FROM]);
        record.setDateFrom(dateFrom);

        // "date to" is optional: an empty column leaves the field unset.
        String dateToValue = row[ATTR_INDEX_DATE_TO];
        if (!dateToValue.isEmpty()) {
            Date dateTo = sdf.parse(dateToValue);
            record.setDateTo(dateTo);
        }

        // Parameterized logging avoids building the message when DEBUG is off.
        logger.debug("scrapped record of: {}", record.getName());

        return record;
    }

}