de.jetwick.snacktory.AbstractPageReader.java Source code

Java tutorial

Introduction

Here is the source code for de.jetwick.snacktory.AbstractPageReader.java

Source

package de.jetwick.snacktory;
/******************************************************************************
 * Copyright (c) 2010 Basis Technology Corp.
 * 
 * Basis Technology Corp. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.io.IOUtils;

/**
 * Base class that turns the raw bytes of a fetched page into a {@link String},
 * choosing the character set from a caller-supplied encoding, an optional
 * {@link PageCharsetDetector}, or UTF-8 as the last resort.
 *
 * <p>The outcome of the most recent {@link #readContent(InputStream, String)} call
 * (chosen charset, whether an encoding was supplied, detected encoding) is kept in
 * unsynchronized instance fields and exposed through the getters below.
 */
public class AbstractPageReader {
    // NOTE(review): the logger is created for HttpPageReader rather than this class;
    // left unchanged because other classes may rely on the shared LOG category.
    static final Logger LOG = LoggerFactory.getLogger(HttpPageReader.class);
    static final Charset UTF8 = Charset.forName("utf-8");

    private PageCharsetDetector charsetDetector;
    private Charset charset;
    private boolean serverReturnedEncoding;
    private boolean respectServerEncoding;
    private String detectedEncoding;

    /**
     * Reads the whole stream into memory and decodes it to a string.
     *
     * <p>Charset selection order:
     * <ol>
     *   <li>{@code forceEncoding}, if non-null and resolvable by {@link Charset#forName(String)};</li>
     *   <li>the configured detector's answer, which is consulted when a detector is set and
     *       either {@code respectServerEncoding} is false or step 1 produced no charset
     *       (a resolvable detector result overrides step 1);</li>
     *   <li>UTF-8, if nothing above yielded a charset.</li>
     * </ol>
     *
     * <p>This method does not itself call {@code close()} on {@code response}.
     *
     * @param response      stream holding the page bytes
     * @param forceEncoding encoding name to try first, or {@code null} if none is known
     * @return the decoded page content
     * @throws IOException if reading the stream fails
     */
    protected String readContent(InputStream response, String forceEncoding) throws IOException {
        byte[] bytes = IOUtils.toByteArray(response);

        // Reset all per-call state so the getters never report values left over
        // from an earlier read on the same instance.
        charset = null;
        detectedEncoding = null;
        serverReturnedEncoding = forceEncoding != null;

        String hint = null;
        if (forceEncoding != null) {
            try {
                charset = Charset.forName(forceEncoding);
                hint = charset.name();
            } catch (IllegalArgumentException e) {
                // Covers both illegal and unsupported charset names. Best effort:
                // fall through to detection / the UTF-8 default instead of failing.
                LOG.warn("Requested character set {} not supported", forceEncoding);
            }
        }

        if (charsetDetector != null && (!respectServerEncoding || charset == null)) {
            String charsetName = charsetDetector.detect(bytes, hint);
            if (charsetName != null) {
                try {
                    charset = Charset.forName(charsetName);
                    detectedEncoding = charset.name();
                } catch (IllegalArgumentException e) {
                    LOG.warn("Detected character set {} not supported", charsetName);
                }
            }
        }

        if (charset == null) {
            LOG.warn("Defaulting to utf-8");
            charset = UTF8;
        }
        return new String(bytes, charset);
    }

    /**
     * @return the detector consulted by {@code readContent}, or {@code null} if none is set
     */
    public PageCharsetDetector getCharsetDetector() {
        return charsetDetector;
    }

    /**
     * @param charsetDetector detector to consult when reading content; {@code null} disables detection
     */
    public void setCharsetDetector(PageCharsetDetector charsetDetector) {
        this.charsetDetector = charsetDetector;
    }

    /**
     * @return the charset used to decode the most recent read, or {@code null} before any read
     */
    public Charset getCharset() {
        return charset;
    }

    /**
     * @return {@code true} if the most recent read was given a non-null encoding name
     */
    public boolean isServerReturnedEncoding() {
        return serverReturnedEncoding;
    }

    /**
     * @param respectServerEncoding when {@code true}, the detector is only consulted if the
     *                              supplied encoding is absent or cannot be resolved
     */
    public void setRespectServerEncoding(boolean respectServerEncoding) {
        this.respectServerEncoding = respectServerEncoding;
    }

    /**
     * @return whether a resolvable supplied encoding takes precedence over detection
     */
    public boolean isRespectServerEncoding() {
        return respectServerEncoding;
    }

    /**
     * @return the canonical name of the charset found by the detector during the most recent
     *         read, or {@code null} if detection was skipped or produced no supported charset
     */
    public String getDetectedEncoding() {
        return detectedEncoding;
    }

}