com.penguineering.cleanuri.sites.reichelt.ReicheltExtractor.java Source code

Introduction

Here is the source code for com.penguineering.cleanuri.sites.reichelt.ReicheltExtractor.java
Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.penguineering.cleanuri.sites.reichelt;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringEscapeUtils;

import com.penguineering.cleanuri.api.Extractor;
import com.penguineering.cleanuri.api.ExtractorException;
import com.penguineering.cleanuri.api.Metakey;

/**
 * Meta-data extractor for Reichelt catalog data.
 * <p>
 * The extractor downloads the page behind a URI and scans it line by line for
 * an {@code <h2>} headline that carries the article ID and the article name.
 * 
 * @author Tux (tux@netz39.de)
 * 
 */
public class ReicheltExtractor implements Extractor {
    /** Opening tag of the headline; the article ID starts right after it. */
    private static final String HEADLINE_TAG = "<h2>";

    /** Marker that ends the article ID and introduces the span holding the name. */
    private static final String SEPARATOR = "<span> :: <span";

    /**
     * Distance from the start of {@link #SEPARATOR} to the first character of
     * the article name. NOTE(review): presumably this skips the separator (15
     * characters) plus the remainder of the opening tag of the name span;
     * verify against the current page markup.
     */
    private static final int NAME_OFFSET = 32;

    /** Closing tag that terminates the article name. */
    private static final String NAME_END = "</span>";

    /** Prefix of the charset parameter inside a Content-Type header value. */
    private static final String CHARSET_PARAM = "charset=";

    public ReicheltExtractor() {

    }

    /**
     * Fetches the page behind the URI and extracts article ID and name.
     * <p>
     * The first line that contains a headline in the expected shape provides
     * the values for {@link Metakey#ID} and {@link Metakey#NAME}; lines that
     * do not match are skipped. If no line matches, the returned map is empty.
     * 
     * @param uri
     *            the URI of the article page, must not be {@code null}
     * @return a new map with the extracted meta-data, possibly empty
     * @throws NullPointerException
     *             if the URI is {@code null}
     * @throws IllegalArgumentException
     *             if the URI cannot be converted to a URL
     * @throws ExtractorException
     *             if reading from the URL fails with an I/O error
     */
    @Override
    public Map<Metakey, String> extractMetadata(URI uri) throws ExtractorException {
        if (uri == null)
            throw new NullPointerException("URI argument must not be null!");

        final URL url;
        try {
            url = uri.toURL();
        } catch (MalformedURLException e) {
            // keep the cause so the caller can see why the conversion failed
            throw new IllegalArgumentException("The provided URI is not a URL!", e);
        }

        final Map<Metakey, String> meta = new HashMap<Metakey, String>();

        try {
            final URLConnection con = url.openConnection();

            LineNumberReader reader = null;
            try {
                // Decode with the charset announced by the server instead of
                // blindly relying on the platform default.
                reader = new LineNumberReader(
                        new InputStreamReader(con.getInputStream(), charsetOf(con.getContentType())));

                String line;
                while ((line = reader.readLine()) != null) {
                    // stop at the first line that yields both values
                    if (parseHeadline(line, meta))
                        break;
                }

                return meta;
            } finally {
                if (reader != null)
                    reader.close();
            }
        } catch (IOException e) {
            throw new ExtractorException("I/O exception during extraction: " + e.getMessage(), e, uri);
        }

    }

    /**
     * Tries to read article ID and name from a single line of the page.
     * <p>
     * Nothing is stored unless all three markers (headline tag, separator and
     * closing span) are found in this order, so a partially matching line
     * neither pollutes the map nor causes an index error.
     * 
     * @param line
     *            one line of the page source
     * @param meta
     *            the map receiving {@link Metakey#ID} and {@link Metakey#NAME}
     * @return {@code true} if both values have been stored, {@code false} if
     *         the line does not have the expected shape
     */
    private static boolean parseHeadline(String line, Map<Metakey, String> meta) {
        final int headlineIdx = line.indexOf(HEADLINE_TAG);
        if (headlineIdx < 0)
            return false;

        // the ID sits between the headline tag and the double-colon separator
        final int idStart = headlineIdx + HEADLINE_TAG.length();
        final int separatorIdx = line.indexOf(SEPARATOR, idStart);
        if (separatorIdx < 0)
            return false;

        // Search the closing span behind the name start only; indexOf yields
        // -1 as well if the start offset lies beyond the end of the line.
        final int nameStart = separatorIdx + NAME_OFFSET;
        final int nameEnd = line.indexOf(NAME_END, nameStart);
        if (nameEnd < 0)
            return false;

        meta.put(Metakey.ID, unescapeHtml(line.substring(idStart, separatorIdx)).trim());
        meta.put(Metakey.NAME, unescapeHtml(line.substring(nameStart, nameEnd)).trim());
        return true;
    }

    /**
     * Determines the charset for decoding the response body.
     * 
     * @param contentType
     *            the Content-Type header value, may be {@code null}
     * @return the charset named by the {@code charset} parameter, or the
     *         platform default charset if the parameter is missing or names a
     *         charset that is illegal or not supported
     */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            for (String param : contentType.split(";")) {
                final String candidate = param.trim();
                // parameter names are matched case-insensitively
                if (!candidate.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length()))
                    continue;

                final String name = candidate.substring(CHARSET_PARAM.length()).replace("\"", "").trim();
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // illegal or unsupported charset name: use the default below
                    break;
                }
            }
        }

        return Charset.defaultCharset();
    }

    /**
     * Replaces HTML entities by the characters they stand for.
     * <p>
     * The result is returned as is. Re-encoding it through a single-byte
     * charset would only replace characters outside that charset by
     * {@code ?}.
     * 
     * @param html
     *            the text containing HTML entities
     * @return the text with the entities unescaped
     */
    private static String unescapeHtml(String html) {
        return StringEscapeUtils.unescapeHtml4(html);
    }

}