com.digitalpebble.stormcrawler.aws.bolt.CloudSearchUtils.java Source code

Introduction

Here is the source code for com.digitalpebble.stormcrawler.aws.bolt.CloudSearchUtils.java
Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.aws.bolt;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Hex;

/**
 * Static helpers that derive document IDs, sanitise text values and
 * normalise field names for Amazon CloudSearch.
 * <p>
 * The class holds no mutable shared state, so all methods may be called
 * concurrently from several threads.
 */
public final class CloudSearchUtils {

    /** Hash algorithm used to turn a URL into a document ID. */
    private static final String DIGEST_ALGORITHM = "SHA-512";

    /** Longest document ID accepted by {@link #getID(String)}. */
    private static final int MAX_ID_LENGTH = 128;

    /** Shortest field name accepted by {@link #cleanFieldName(String)}. */
    private static final int MIN_FIELD_NAME_LENGTH = 3;

    /** Longest field name accepted by {@link #cleanFieldName(String)}. */
    private static final int MAX_FIELD_NAME_LENGTH = 64;

    /** Field name rejected by {@link #cleanFieldName(String)}. */
    private static final String RESERVED_FIELD_NAME = "score";

    /**
     * Matches everything except tab, line feed, carriage return and the
     * ranges U+0020-U+D7FF and U+E000-U+FFFD.
     */
    private static final Pattern INVALID_XML_CHARS = Pattern
            .compile("[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD]");

    /** Matches every character that is not a-z, 0-9 or underscore. */
    private static final Pattern ILLEGAL_FIELD_NAME_CHARS = Pattern
            .compile("[^a-z_0-9]");

    static {
        // Probe once at class-load time so that a missing digest algorithm
        // is reported immediately rather than on the first getID() call.
        newDigester();
    }

    private CloudSearchUtils() {
    }

    /**
     * Creates a fresh digest instance. MessageDigest objects are stateful
     * and must not be shared between threads, hence one instance per call.
     *
     * @return a new digest for {@link #DIGEST_ALGORITHM}
     * @throws RuntimeException
     *             wrapping the NoSuchAlgorithmException if the algorithm is
     *             not available
     */
    private static MessageDigest newDigester() {
        try {
            return MessageDigest.getInstance(DIGEST_ALGORITHM);
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns a normalised doc ID based on the URL of a document: the
     * hex-encoded SHA-512 digest of the UTF-8 bytes of the URL.
     * <p>
     * See
     * http://docs.aws.amazon.com/cloudsearch/latest/developerguide/preparing-data.html#creating-document-batches
     * : a document ID can contain any letter or number and the following
     * characters: _ - = # ; : / ? @ &amp;. Document IDs must be at least 1
     * and no more than 128 characters long.
     *
     * @param url
     *            URL of the document, must not be null
     * @return the hex-encoded digest of the URL
     * @throws NullPointerException
     *             if {@code url} is null
     * @throws RuntimeException
     *             if the generated ID is longer than 128 characters
     */
    public static String getID(String url) {
        byte[] hash = newDigester().digest(url.getBytes(StandardCharsets.UTF_8));
        String id = Hex.encodeHexString(hash);
        // Defensive check: a 64-byte SHA-512 digest encodes to 128 hex
        // characters, so this should never trigger unless the algorithm or
        // the encoding is changed.
        if (id.length() > MAX_ID_LENGTH) {
            throw new RuntimeException("ID larger than max 128 chars");
        }
        return id;
    }

    /**
     * Removes from the input every character other than tab, line feed,
     * carriage return and those in the ranges U+0020-U+D7FF and
     * U+E000-U+FFFD. Characters outside the Basic Multilingual Plane fall
     * outside these ranges and are therefore removed as well.
     *
     * @param input
     *            text to sanitise, must not be null
     * @return the input without the rejected characters
     */
    public static String stripNonCharCodepoints(String input) {
        return INVALID_XML_CHARS.matcher(input).replaceAll("");
    }

    /**
     * Converts a name into a CloudSearch field name: the name is lowercased
     * (locale-independently) and every character other than a-z, 0-9 and
     * underscore is replaced by an underscore. Note that this might convert
     * two different names to the same field name.
     *
     * @see <a
     *      href="http://docs.aws.amazon.com/cloudsearch/latest/developerguide/configuring-index-fields.html">
     *      configuring-index-fields.html</a>
     * @param name
     *            original field name, must not be null
     * @return the cleaned field name
     * @throws IllegalArgumentException
     *             if the cleaned name is shorter than 3 or longer than 64
     *             characters, or is the reserved name "score"
     */
    public static String cleanFieldName(String name) {
        // Locale.ROOT keeps the result independent of the JVM default
        // locale (e.g. 'I' would not lowercase to 'i' under a Turkish one).
        String cleaned = ILLEGAL_FIELD_NAME_CHARS
                .matcher(name.toLowerCase(Locale.ROOT)).replaceAll("_");
        int length = cleaned.length();
        if (length < MIN_FIELD_NAME_LENGTH || length > MAX_FIELD_NAME_LENGTH) {
            throw new IllegalArgumentException(
                    "Field name must be between 3 and 64 chars : " + cleaned);
        }
        if (RESERVED_FIELD_NAME.equals(cleaned)) {
            throw new IllegalArgumentException("Field name must not be score");
        }
        return cleaned;
    }

}