org.apache.nutch.api.impl.db.DbPageConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.api.impl.db.DbPageConverter.java

Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.api.impl.db;

import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.avro.Schema.Field;
import org.apache.avro.util.Utf8;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.nutch.parse.ParseStatusUtils;
import org.apache.nutch.protocol.ProtocolStatusUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.StringUtil;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

public class DbPageConverter {

    public static Map<String, Object> convertPage(WebPage page, Set<String> fields) {
        Map<String, Object> result = Maps.newHashMap();
        for (Field field : filterFields(page, fields)) {
            Object value = convertField(page, field);
            if (value != null) {
                result.put(field.name(), value);
            }
        }
        return result;
    }

    private static Object convertField(WebPage page, Field field) {
        int index = field.pos();
        if (index < 0) {
            return null;
        }

        Object value = page.get(index);
        if (value == null) {
            return null;
        }

        String fieldName = field.name();
        if (StringUtils.equals(fieldName, "metadata")) {
            return getSimpleMetadata(page);
        }
        if (StringUtils.equals(fieldName, "protocolStatus")) {
            return ProtocolStatusUtils.toString(page.getProtocolStatus());
        }
        if (StringUtils.equals(fieldName, "parseStatus")) {
            return ParseStatusUtils.toString(page.getParseStatus());
        }
        if (StringUtils.equals(fieldName, "signature")) {
            return StringUtil.toHexString(page.getSignature());
        }
        if (StringUtils.equals(fieldName, "content")) {
            return Bytes.toStringBinary(page.getContent());
        }
        if (StringUtils.equals(fieldName, "markers")) {
            return convertToStringsMap(page.getMarkers());
        }
        if (StringUtils.equals(fieldName, "inlinks")) {
            return convertToStringsMap(page.getInlinks());
        }
        if (StringUtils.equals(fieldName, "outlinks")) {
            return convertToStringsMap(page.getOutlinks());
        }

        if (value instanceof Utf8) {
            return value.toString();
        }

        if (value instanceof ByteBuffer) {
            return Bytes.toStringBinary((ByteBuffer) value);
        }

        return value;
    }

    private static Set<Field> filterFields(WebPage page, Set<String> queryFields) {
        List<Field> pageFields = page.getSchema().getFields();
        if (CollectionUtils.isEmpty(queryFields)) {
            return Sets.newHashSet(pageFields);
        }

        Set<Field> filteredFields = Sets.newLinkedHashSet();
        for (Field field : pageFields) {
            if (queryFields.contains(field.name())) {
                filteredFields.add(field);
            }
        }
        return filteredFields;
    }

    private static Map<String, String> getSimpleMetadata(WebPage page) {
        Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
        if (MapUtils.isEmpty(metadata)) {
            return Collections.emptyMap();
        }
        Map<String, String> simpleMeta = Maps.newHashMap();
        for (CharSequence key : metadata.keySet()) {
            simpleMeta.put(key.toString(), Bytes.toStringBinary(metadata.get(key)));
        }
        return simpleMeta;
    }

    private static Map<String, String> convertToStringsMap(Map<?, ?> map) {
        Map<String, String> res = Maps.newHashMap();
        for (Entry<?, ?> entry : map.entrySet()) {
            res.put(entry.getKey().toString(), entry.getValue().toString());
        }
        return res;
    }
}