org.eclipse.jgit.util.RawParseUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.jgit.util.RawParseUtils.java

Source

/*
 * Copyright (C) 2008-2009, Google Inc.
 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.util;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.eclipse.jgit.lib.ObjectChecker.author;
import static org.eclipse.jgit.lib.ObjectChecker.committer;
import static org.eclipse.jgit.lib.ObjectChecker.encoding;
import static org.eclipse.jgit.lib.ObjectChecker.tagger;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.errors.BinaryBlobException;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.PersonIdent;

/**
 * Handy utility functions to parse raw object contents.
 */
public final class RawParseUtils {
    /**
     * UTF-8 charset constant.
     *
     * @since 2.2
     * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
     */
    @Deprecated
    public static final Charset UTF8_CHARSET = UTF_8;

    private static final byte[] digits10;

    private static final byte[] digits16;

    private static final byte[] footerLineKeyChars;

    private static final Map<String, Charset> encodingAliases;

    static {
        encodingAliases = new HashMap<>();
        encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
        encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$

        digits10 = new byte['9' + 1];
        Arrays.fill(digits10, (byte) -1);
        for (char i = '0'; i <= '9'; i++)
            digits10[i] = (byte) (i - '0');

        digits16 = new byte['f' + 1];
        Arrays.fill(digits16, (byte) -1);
        for (char i = '0'; i <= '9'; i++)
            digits16[i] = (byte) (i - '0');
        for (char i = 'a'; i <= 'f'; i++)
            digits16[i] = (byte) ((i - 'a') + 10);
        for (char i = 'A'; i <= 'F'; i++)
            digits16[i] = (byte) ((i - 'A') + 10);

        footerLineKeyChars = new byte['z' + 1];
        footerLineKeyChars['-'] = 1;
        for (char i = '0'; i <= '9'; i++)
            footerLineKeyChars[i] = 1;
        for (char i = 'A'; i <= 'Z'; i++)
            footerLineKeyChars[i] = 1;
        for (char i = 'a'; i <= 'z'; i++)
            footerLineKeyChars[i] = 1;
    }

    /**
     * Determine if b[ptr] matches src.
     *
     * @param b
     *            the buffer to scan.
     * @param ptr
     *            first position within b, this should match src[0].
     * @param src
     *            the buffer to test for equality with b.
     * @return ptr + src.length if b[ptr..src.length] == src; else -1.
     */
    public static final int match(byte[] b, int ptr, byte[] src) {
        if (ptr + src.length > b.length)
            return -1;
        for (int i = 0; i < src.length; i++, ptr++)
            if (b[ptr] != src[i])
                return -1;
        return ptr;
    }

    private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };

    /**
     * Format a base 10 numeric into a temporary buffer.
     * <p>
     * Formatting is performed backwards. The method starts at offset
     * <code>o-1</code> and ends at <code>o-1-digits</code>, where
     * <code>digits</code> is the number of positions necessary to store the
     * base 10 value.
     * <p>
     * The argument and return values from this method make it easy to chain
     * writing, for example:
     * </p>
     *
     * <pre>
     * final byte[] tmp = new byte[64];
     * int ptr = tmp.length;
     * tmp[--ptr] = '\n';
     * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
     * tmp[--ptr] = ' ';
     * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
     * tmp[--ptr] = 0;
     * final String str = new String(tmp, ptr, tmp.length - ptr);
     * </pre>
     *
     * @param b
     *            buffer to write into.
     * @param o
     *            one offset past the location where writing will begin; writing
     *            proceeds towards lower index values.
     * @param value
     *            the value to store.
     * @return the new offset value <code>o</code>. This is the position of
     *         the last byte written. Additional writing should start at one
     *         position earlier.
     */
    public static int formatBase10(final byte[] b, int o, int value) {
        if (value == 0) {
            b[--o] = '0';
            return o;
        }
        final boolean isneg = value < 0;
        if (isneg)
            value = -value;
        while (value != 0) {
            b[--o] = base10byte[value % 10];
            value /= 10;
        }
        if (isneg)
            b[--o] = '-';
        return o;
    }

    /**
     * Parse a base 10 numeric from a sequence of ASCII digits into an int.
     * <p>
     * Digit sequences can begin with an optional run of spaces before the
     * sequence, and may start with a '+' or a '-' to indicate sign position.
     * Any other characters will cause the method to stop and return the current
     * result to the caller.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @param ptrResult
     *            optional location to return the new ptr value through. If null
     *            the ptr value will be discarded.
     * @return the value at this location; 0 if the location is not a valid
     *         numeric.
     */
    public static final int parseBase10(final byte[] b, int ptr, final MutableInteger ptrResult) {
        int r = 0;
        int sign = 0;
        try {
            final int sz = b.length;
            while (ptr < sz && b[ptr] == ' ')
                ptr++;
            if (ptr >= sz)
                return 0;

            switch (b[ptr]) {
            case '-':
                sign = -1;
                ptr++;
                break;
            case '+':
                ptr++;
                break;
            }

            while (ptr < sz) {
                final byte v = digits10[b[ptr]];
                if (v < 0)
                    break;
                r = (r * 10) + v;
                ptr++;
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            // Not a valid digit.
        }
        if (ptrResult != null)
            ptrResult.value = ptr;
        return sign < 0 ? -r : r;
    }

    /**
     * Parse a base 10 numeric from a sequence of ASCII digits into a long.
     * <p>
     * Digit sequences can begin with an optional run of spaces before the
     * sequence, and may start with a '+' or a '-' to indicate sign position.
     * Any other characters will cause the method to stop and return the current
     * result to the caller.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @param ptrResult
     *            optional location to return the new ptr value through. If null
     *            the ptr value will be discarded.
     * @return the value at this location; 0 if the location is not a valid
     *         numeric.
     */
    public static final long parseLongBase10(final byte[] b, int ptr, final MutableInteger ptrResult) {
        long r = 0;
        int sign = 0;
        try {
            final int sz = b.length;
            while (ptr < sz && b[ptr] == ' ')
                ptr++;
            if (ptr >= sz)
                return 0;

            switch (b[ptr]) {
            case '-':
                sign = -1;
                ptr++;
                break;
            case '+':
                ptr++;
                break;
            }

            while (ptr < sz) {
                final byte v = digits10[b[ptr]];
                if (v < 0)
                    break;
                r = (r * 10) + v;
                ptr++;
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            // Not a valid digit.
        }
        if (ptrResult != null)
            ptrResult.value = ptr;
        return sign < 0 ? -r : r;
    }

    /**
     * Parse 4 character base 16 (hex) formatted string to unsigned integer.
     * <p>
     * The number is read in network byte order, that is, most significant
     * nybble first.
     *
     * @param bs
     *            buffer to parse digits from; positions {@code [p, p+4)} will
     *            be parsed.
     * @param p
     *            first position within the buffer to parse.
     * @return the integer value.
     * @throws java.lang.ArrayIndexOutOfBoundsException
     *             if the string is not hex formatted.
     */
    public static final int parseHexInt16(final byte[] bs, final int p) {
        int r = digits16[bs[p]] << 4;

        r |= digits16[bs[p + 1]];
        r <<= 4;

        r |= digits16[bs[p + 2]];
        r <<= 4;

        r |= digits16[bs[p + 3]];
        if (r < 0)
            throw new ArrayIndexOutOfBoundsException();
        return r;
    }

    /**
     * Parse 8 character base 16 (hex) formatted string to unsigned integer.
     * <p>
     * The number is read in network byte order, that is, most significant
     * nybble first.
     *
     * @param bs
     *            buffer to parse digits from; positions {@code [p, p+8)} will
     *            be parsed.
     * @param p
     *            first position within the buffer to parse.
     * @return the integer value.
     * @throws java.lang.ArrayIndexOutOfBoundsException
     *             if the string is not hex formatted.
     */
    public static final int parseHexInt32(final byte[] bs, final int p) {
        int r = digits16[bs[p]] << 4;

        r |= digits16[bs[p + 1]];
        r <<= 4;

        r |= digits16[bs[p + 2]];
        r <<= 4;

        r |= digits16[bs[p + 3]];
        r <<= 4;

        r |= digits16[bs[p + 4]];
        r <<= 4;

        r |= digits16[bs[p + 5]];
        r <<= 4;

        r |= digits16[bs[p + 6]];

        final int last = digits16[bs[p + 7]];
        if (r < 0 || last < 0)
            throw new ArrayIndexOutOfBoundsException();
        return (r << 4) | last;
    }

    /**
     * Parse 16 character base 16 (hex) formatted string to unsigned long.
     * <p>
     * The number is read in network byte order, that is, most significant
     * nibble first.
     *
     * @param bs
     *            buffer to parse digits from; positions {@code [p, p+16)} will
     *            be parsed.
     * @param p
     *            first position within the buffer to parse.
     * @return the integer value.
     * @throws java.lang.ArrayIndexOutOfBoundsException
     *             if the string is not hex formatted.
     * @since 4.3
     */
    public static final long parseHexInt64(final byte[] bs, final int p) {
        long r = digits16[bs[p]] << 4;

        r |= digits16[bs[p + 1]];
        r <<= 4;

        r |= digits16[bs[p + 2]];
        r <<= 4;

        r |= digits16[bs[p + 3]];
        r <<= 4;

        r |= digits16[bs[p + 4]];
        r <<= 4;

        r |= digits16[bs[p + 5]];
        r <<= 4;

        r |= digits16[bs[p + 6]];
        r <<= 4;

        r |= digits16[bs[p + 7]];
        r <<= 4;

        r |= digits16[bs[p + 8]];
        r <<= 4;

        r |= digits16[bs[p + 9]];
        r <<= 4;

        r |= digits16[bs[p + 10]];
        r <<= 4;

        r |= digits16[bs[p + 11]];
        r <<= 4;

        r |= digits16[bs[p + 12]];
        r <<= 4;

        r |= digits16[bs[p + 13]];
        r <<= 4;

        r |= digits16[bs[p + 14]];

        final int last = digits16[bs[p + 15]];
        if (r < 0 || last < 0)
            throw new ArrayIndexOutOfBoundsException();
        return (r << 4) | last;
    }

    /**
     * Parse a single hex digit to its numeric value (0-15).
     *
     * @param digit
     *            hex character to parse.
     * @return numeric value, in the range 0-15.
     * @throws java.lang.ArrayIndexOutOfBoundsException
     *             if the input digit is not a valid hex digit.
     */
    public static final int parseHexInt4(final byte digit) {
        final byte r = digits16[digit];
        if (r < 0)
            throw new ArrayIndexOutOfBoundsException();
        return r;
    }

    /**
     * Parse a Git style timezone string.
     * <p>
     * The sequence "-0315" will be parsed as the numeric value -195, as the
     * lower two positions count minutes, not 100ths of an hour.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @return the timezone at this location, expressed in minutes.
     */
    public static final int parseTimeZoneOffset(byte[] b, int ptr) {
        return parseTimeZoneOffset(b, ptr, null);
    }

    /**
     * Parse a Git style timezone string.
     * <p>
     * The sequence "-0315" will be parsed as the numeric value -195, as the
     * lower two positions count minutes, not 100ths of an hour.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @param ptrResult
     *            optional location to return the new ptr value through. If null
     *            the ptr value will be discarded.
     * @return the timezone at this location, expressed in minutes.
     * @since 4.1
     */
    public static final int parseTimeZoneOffset(final byte[] b, int ptr, MutableInteger ptrResult) {
        final int v = parseBase10(b, ptr, ptrResult);
        final int tzMins = v % 100;
        final int tzHours = v / 100;
        return tzHours * 60 + tzMins;
    }

    /**
     * Locate the first position after a given character.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA at.
     * @param chrA
     *            character to find.
     * @return new position just after chrA.
     */
    public static final int next(byte[] b, int ptr, char chrA) {
        final int sz = b.length;
        while (ptr < sz) {
            if (b[ptr++] == chrA)
                return ptr;
        }
        return ptr;
    }

    /**
     * Locate the first position after the next LF.
     * <p>
     * This method stops on the first '\n' it finds.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for LF at.
     * @return new position just after the first LF found.
     */
    public static final int nextLF(byte[] b, int ptr) {
        return next(b, ptr, '\n');
    }

    /**
     * Locate the first position after either the given character or LF.
     * <p>
     * This method stops on the first match it finds from either chrA or '\n'.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA or LF at.
     * @param chrA
     *            character to find.
     * @return new position just after the first chrA or LF to be found.
     */
    public static final int nextLF(byte[] b, int ptr, char chrA) {
        final int sz = b.length;
        while (ptr < sz) {
            final byte c = b[ptr++];
            if (c == chrA || c == '\n')
                return ptr;
        }
        return ptr;
    }

    /**
     * Locate the end of the header.  Note that headers may be
     * more than one line long.
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for the end-of-header.
     * @return new position just after the header.  This is either
     * b.length, or the index of the header's terminating newline.
     * @since 5.1
     */
    public static final int headerEnd(final byte[] b, int ptr) {
        final int sz = b.length;
        while (ptr < sz) {
            final byte c = b[ptr++];
            if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
                return ptr - 1;
            }
        }
        return ptr - 1;
    }

    /**
     * Find the start of the contents of a given header.
     *
     * @param b
     *            buffer to scan.
     * @param headerName
     *            header to search for
     * @param ptr
     *            position within buffer to start looking for header at.
     * @return new position at the start of the header's contents, -1 for
     *         not found
     * @since 5.1
     */
    public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
        // Start by advancing to just past a LF or buffer start
        if (ptr != 0) {
            ptr = nextLF(b, ptr - 1);
        }
        while (ptr < b.length - (headerName.length + 1)) {
            boolean found = true;
            for (int i = 0; i < headerName.length; i++) {
                if (headerName[i] != b[ptr++]) {
                    found = false;
                    break;
                }
            }
            if (found && b[ptr++] == ' ') {
                return ptr;
            }
            ptr = nextLF(b, ptr);
        }
        return -1;
    }

    /**
     * Locate the first position before a given character.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA at.
     * @param chrA
     *            character to find.
     * @return new position just before chrA, -1 for not found
     */
    public static final int prev(byte[] b, int ptr, char chrA) {
        if (ptr == b.length)
            --ptr;
        while (ptr >= 0) {
            if (b[ptr--] == chrA)
                return ptr;
        }
        return ptr;
    }

    /**
     * Locate the first position before the previous LF.
     * <p>
     * This method stops on the first '\n' it finds.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for LF at.
     * @return new position just before the first LF found, -1 for not found
     */
    public static final int prevLF(byte[] b, int ptr) {
        return prev(b, ptr, '\n');
    }

    /**
     * Locate the previous position before either the given character or LF.
     * <p>
     * This method stops on the first match it finds from either chrA or '\n'.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA or LF at.
     * @param chrA
     *            character to find.
     * @return new position just before the first chrA or LF to be found, -1 for
     *         not found
     */
    public static final int prevLF(byte[] b, int ptr, char chrA) {
        if (ptr == b.length)
            --ptr;
        while (ptr >= 0) {
            final byte c = b[ptr--];
            if (c == chrA || c == '\n')
                return ptr;
        }
        return ptr;
    }

    /**
     * Index the region between <code>[ptr, end)</code> to find line starts.
     * <p>
     * The returned list is 1 indexed. Index 0 contains
     * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
     * <p>
     * Using a 1 indexed list means that line numbers can be directly accessed
     * from the list, so <code>list.get(1)</code> (aka get line 1) returns
     * <code>ptr</code>.
     * <p>
     * The last element (index <code>map.size()-1</code>) always contains
     * <code>end</code>.
     *
     * @param buf
     *            buffer to scan.
     * @param ptr
     *            position within the buffer corresponding to the first byte of
     *            line 1.
     * @param end
     *            1 past the end of the content within <code>buf</code>.
     * @return a line map indicating the starting position of each line.
     */
    public static final IntList lineMap(byte[] buf, int ptr, int end) {
        IntList map = new IntList((end - ptr) / 36);
        map.fillTo(1, Integer.MIN_VALUE);
        for (; ptr < end; ptr = nextLF(buf, ptr)) {
            map.add(ptr);
        }
        map.add(end);
        return map;
    }

    /**
     * Like {@link #lineMap(byte[], int, int)} but throw
     * {@link BinaryBlobException} if a NUL byte is encountered.
     *
     * @param buf
     *            buffer to scan.
     * @param ptr
     *            position within the buffer corresponding to the first byte of
     *            line 1.
     * @param end
     *            1 past the end of the content within <code>buf</code>.
     * @return a line map indicating the starting position of each line.
     * @throws BinaryBlobException
     *            if a NUL byte is found.
     * @since 5.0
     */
    public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end) throws BinaryBlobException {
        IntList map = lineMapOrNull(buf, ptr, end);
        if (map == null) {
            throw new BinaryBlobException();
        }
        return map;
    }

    @Nullable
    private static IntList lineMapOrNull(byte[] buf, int ptr, int end) {
        // Experimentally derived from multiple source repositories
        // the average number of bytes/line is 36. Its a rough guess
        // to initially size our map close to the target.
        IntList map = new IntList((end - ptr) / 36);
        map.add(Integer.MIN_VALUE);
        boolean foundLF = true;
        for (; ptr < end; ptr++) {
            if (foundLF) {
                map.add(ptr);
            }

            if (buf[ptr] == '\0') {
                return null;
            }

            foundLF = (buf[ptr] == '\n');
        }
        map.add(end);
        return map;
    }

    /**
     * Locate the "author " header line data.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            commit buffer and does not accidentally look at message body.
     * @return position just after the space in "author ", so the first
     *         character of the author's name. If no author header can be
     *         located -1 is returned.
     */
    public static final int author(byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0)
            ptr += 46; // skip the "tree ..." line.
        while (ptr < sz && b[ptr] == 'p')
            ptr += 48; // skip this parent.
        return match(b, ptr, author);
    }

    /**
     * Locate the "committer " header line data.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            commit buffer and does not accidentally look at message body.
     * @return position just after the space in "committer ", so the first
     *         character of the committer's name. If no committer header can be
     *         located -1 is returned.
     */
    public static final int committer(byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0)
            ptr += 46; // skip the "tree ..." line.
        while (ptr < sz && b[ptr] == 'p')
            ptr += 48; // skip this parent.
        if (ptr < sz && b[ptr] == 'a')
            ptr = nextLF(b, ptr);
        return match(b, ptr, committer);
    }

    /**
     * Locate the "tagger " header line data.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the tag
     *            buffer and does not accidentally look at message body.
     * @return position just after the space in "tagger ", so the first
     *         character of the tagger's name. If no tagger header can be
     *         located -1 is returned.
     */
    public static final int tagger(byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0)
            ptr += 48; // skip the "object ..." line.
        while (ptr < sz) {
            if (b[ptr] == '\n')
                return -1;
            final int m = match(b, ptr, tagger);
            if (m >= 0)
                return m;
            ptr = nextLF(b, ptr);
        }
        return -1;
    }

    /**
     * Locate the "encoding " header line.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            buffer and does not accidentally look at the message body.
     * @return position just after the space in "encoding ", so the first
     *         character of the encoding's name. If no encoding header can be
     *         located -1 is returned (and UTF-8 should be assumed).
     */
    public static final int encoding(byte[] b, int ptr) {
        final int sz = b.length;
        while (ptr < sz) {
            if (b[ptr] == '\n')
                return -1;
            if (b[ptr] == 'e')
                break;
            ptr = nextLF(b, ptr);
        }
        return match(b, ptr, encoding);
    }

    /**
     * Parse the "encoding " header as a string.
     * <p>
     * Locates the "encoding " header (if present) and returns its value.
     *
     * @param b
     *            buffer to scan.
     * @return the encoding header as specified in the commit; null if the
     *         header was not present and should be assumed.
     * @since 4.2
     */
    @Nullable
    public static String parseEncodingName(byte[] b) {
        int enc = encoding(b, 0);
        if (enc < 0) {
            return null;
        }
        int lf = nextLF(b, enc);
        return decode(UTF_8, b, enc, lf - 1);
    }

    /**
     * Parse the "encoding " header into a character set reference.
     * <p>
     * Locates the "encoding " header (if present) by first calling
     * {@link #encoding(byte[], int)} and then returns the proper character set
     * to apply to this buffer to evaluate its contents as character data.
     * <p>
     * If no encoding header is present {@code UTF-8} is assumed.
     *
     * @param b
     *            buffer to scan.
     * @return the Java character set representation. Never null.
     * @throws IllegalCharsetNameException
     *             if the character set requested by the encoding header is
     *             malformed and unsupportable.
     * @throws UnsupportedCharsetException
     *             if the JRE does not support the character set requested by
     *             the encoding header.
     */
    public static Charset parseEncoding(byte[] b) {
        String enc = parseEncodingName(b);
        if (enc == null) {
            return UTF_8;
        }

        String name = enc.trim();
        try {
            return Charset.forName(name);
        } catch (IllegalCharsetNameException | UnsupportedCharsetException badName) {
            Charset aliased = charsetForAlias(name);
            if (aliased != null) {
                return aliased;
            }
            throw badName;
        }
    }

    /**
     * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
     * <p>
     * Leading spaces won't be trimmed from the string, i.e. will show up in the
     * parsed name afterwards.
     *
     * @param in
     *            the string to parse a name from.
     * @return the parsed identity or null in case the identity could not be
     *         parsed.
     */
    public static PersonIdent parsePersonIdent(String in) {
        return parsePersonIdent(Constants.encode(in), 0);
    }

    /**
     * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
     * <p>
     * When passing in a value for <code>nameB</code> callers should use the
     * return value of {@link #author(byte[], int)} or
     * {@link #committer(byte[], int)}, as these methods provide the proper
     * position within the buffer.
     *
     * @param raw
     *            the buffer to parse character data from.
     * @param nameB
     *            first position of the identity information. This should be the
     *            first position after the space which delimits the header field
     *            name (e.g. "author" or "committer") from the rest of the
     *            identity line.
     * @return the parsed identity or null in case the identity could not be
     *         parsed.
     */
    public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
        Charset cs;
        try {
            cs = parseEncoding(raw);
        } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
            // Assume UTF-8 for person identities, usually this is correct.
            // If not decode() will fall back to the ISO-8859-1 encoding.
            cs = UTF_8;
        }

        final int emailB = nextLF(raw, nameB, '<');
        final int emailE = nextLF(raw, emailB, '>');
        if (emailB >= raw.length || raw[emailB] == '\n' || (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
            return null;

        final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ? emailB - 2 : emailB - 1;
        final String name = decode(cs, raw, nameB, nameEnd);
        final String email = decode(cs, raw, emailB, emailE - 1);

        // Start searching from end of line, as after first name-email pair,
        // another name-email pair may occur. We will ignore all kinds of
        // "junk" following the first email.
        //
        // We've to use (emailE - 1) for the case that raw[email] is LF,
        // otherwise we would run too far. "-2" is necessary to position
        // before the LF in case of LF termination resp. the penultimate
        // character if there is no trailing LF.
        final int tzBegin = lastIndexOfTrim(raw, ' ', nextLF(raw, emailE - 1) - 2) + 1;
        if (tzBegin <= emailE) // No time/zone, still valid
            return new PersonIdent(name, email, 0, 0);

        final int whenBegin = Math.max(emailE, lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
        if (whenBegin >= tzBegin - 1) // No time/zone, still valid
            return new PersonIdent(name, email, 0, 0);

        final long when = parseLongBase10(raw, whenBegin, null);
        final int tz = parseTimeZoneOffset(raw, tzBegin);
        return new PersonIdent(name, email, when * 1000L, tz);
    }

    /**
     * Parse a name data (e.g. as within a reflog) into a PersonIdent.
     * <p>
     * When passing in a value for <code>nameB</code> callers should use the
     * return value of {@link #author(byte[], int)} or
     * {@link #committer(byte[], int)}, as these methods provide the proper
     * position within the buffer.
     *
     * @param raw
     *            the buffer to parse character data from.
     * @param nameB
     *            first position of the identity information. This should be the
     *            first position after the space which delimits the header field
     *            name (e.g. "author" or "committer") from the rest of the
     *            identity line.
     * @return the parsed identity. Never null.
     */
    public static PersonIdent parsePersonIdentOnly(final byte[] raw, final int nameB) {
        int stop = nextLF(raw, nameB);
        int emailB = nextLF(raw, nameB, '<');
        int emailE = nextLF(raw, emailB, '>');
        final String name;
        final String email;
        if (emailE < stop) {
            email = decode(raw, emailB, emailE - 1);
        } else {
            email = "invalid"; //$NON-NLS-1$
        }
        if (emailB < stop)
            name = decode(raw, nameB, emailB - 2);
        else
            name = decode(raw, nameB, stop);

        final MutableInteger ptrout = new MutableInteger();
        long when;
        int tz;
        if (emailE < stop) {
            when = parseLongBase10(raw, emailE + 1, ptrout);
            tz = parseTimeZoneOffset(raw, ptrout.value);
        } else {
            when = 0;
            tz = 0;
        }
        return new PersonIdent(name, email, when * 1000L, tz);
    }

    /**
     * Locate the end of a footer line key string.
     * <p>
     * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
     * "Signed-off-by: A. U. Thor\n") then this method returns the position of
     * the first ':'.
     * <p>
     * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
     * then this method returns -1.
     *
     * @param raw
     *            buffer to scan.
     * @param ptr
     *            first position within raw to consider as a footer line key.
     * @return position of the ':' which terminates the footer line key if this
     *         is otherwise a valid footer line key; otherwise -1.
     */
    public static int endOfFooterLineKey(byte[] raw, int ptr) {
        try {
            for (;;) {
                final byte c = raw[ptr];
                if (footerLineKeyChars[c] == 0) {
                    if (c == ':')
                        return ptr;
                    return -1;
                }
                ptr++;
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            return -1;
        }
    }

    /**
     * Decode a buffer under UTF-8, if possible.
     *
     * If the byte stream cannot be decoded that way, the platform default is tried
     * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
     *
     * @param buffer
     *            buffer to pull raw bytes from.
     * @return a string representation of the range <code>[start,end)</code>,
     *         after decoding the region through the specified character set.
     */
    public static String decode(byte[] buffer) {
        return decode(buffer, 0, buffer.length);
    }

    /**
     * Decode a buffer under UTF-8, if possible.
     *
     * If the byte stream cannot be decoded that way, the platform default is
     * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
     *
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            start position in buffer
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>,
     *         after decoding the region through the specified character set.
     */
    public static String decode(final byte[] buffer, final int start, final int end) {
        return decode(UTF_8, buffer, start, end);
    }

    /**
     * Decode a buffer under the specified character set if possible.
     *
     * If the byte stream cannot be decoded that way, the platform default is tried
     * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
     *
     * @param cs
     *            character set to use when decoding the buffer.
     * @param buffer
     *            buffer to pull raw bytes from.
     * @return a string representation of the range <code>[start,end)</code>,
     *         after decoding the region through the specified character set.
     */
    public static String decode(Charset cs, byte[] buffer) {
        return decode(cs, buffer, 0, buffer.length);
    }

    /**
     * Decode a region of the buffer under the specified character set if possible.
     *
     * If the byte stream cannot be decoded that way, the platform default is tried
     * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
     *
     * @param cs
     *            character set to use when decoding the buffer.
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            first position within the buffer to take data from.
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>,
     *         after decoding the region through the specified character set.
     */
    public static String decode(final Charset cs, final byte[] buffer, final int start, final int end) {
        try {
            return decodeNoFallback(cs, buffer, start, end);
        } catch (CharacterCodingException e) {
            // Fall back to an ISO-8859-1 style encoding. At least all of
            // the bytes will be present in the output.
            //
            return extractBinaryString(buffer, start, end);
        }
    }

    /**
     * Decode a region of the buffer under the specified character set if
     * possible.
     *
     * If the byte stream cannot be decoded that way, the platform default is
     * tried and if that too fails, an exception is thrown.
     *
     * @param cs
     *            character set to use when decoding the buffer.
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            first position within the buffer to take data from.
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>,
     *         after decoding the region through the specified character set.
     * @throws java.nio.charset.CharacterCodingException
     *             the input is not in any of the tested character sets.
     */
    public static String decodeNoFallback(final Charset cs, final byte[] buffer, final int start, final int end)
            throws CharacterCodingException {
        ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
        b.mark();

        // Try our built-in favorite. The assumption here is that
        // decoding will fail if the data is not actually encoded
        // using that encoder.
        try {
            return decode(b, UTF_8);
        } catch (CharacterCodingException e) {
            b.reset();
        }

        if (!cs.equals(UTF_8)) {
            // Try the suggested encoding, it might be right since it was
            // provided by the caller.
            try {
                return decode(b, cs);
            } catch (CharacterCodingException e) {
                b.reset();
            }
        }

        // Try the default character set. A small group of people
        // might actually use the same (or very similar) locale.
        Charset defcs = Charset.defaultCharset();
        if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
            try {
                return decode(b, defcs);
            } catch (CharacterCodingException e) {
                b.reset();
            }
        }

        throw new CharacterCodingException();
    }

    /**
     * Decode a region of the buffer under the ISO-8859-1 encoding.
     *
     * Each byte is treated as a single character in the 8859-1 character
     * encoding, performing a raw binary-&gt;char conversion.
     *
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            first position within the buffer to take data from.
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>.
     */
    public static String extractBinaryString(final byte[] buffer, final int start, final int end) {
        final StringBuilder r = new StringBuilder(end - start);
        for (int i = start; i < end; i++)
            r.append((char) (buffer[i] & 0xff));
        return r.toString();
    }

    private static String decode(ByteBuffer b, Charset charset) throws CharacterCodingException {
        final CharsetDecoder d = charset.newDecoder();
        d.onMalformedInput(CodingErrorAction.REPORT);
        d.onUnmappableCharacter(CodingErrorAction.REPORT);
        return d.decode(b).toString();
    }

    /**
     * Locate the position of the commit message body.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            commit buffer.
     * @return position of the user's message buffer.
     */
    public static final int commitMessage(byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0)
            ptr += 46; // skip the "tree ..." line.
        while (ptr < sz && b[ptr] == 'p')
            ptr += 48; // skip this parent.

        // Skip any remaining header lines, ignoring what their actual
        // header line type is. This is identical to the logic for a tag.
        //
        return tagMessage(b, ptr);
    }

    /**
     * Locate the position of the tag message body.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the tag
     *            buffer.
     * @return position of the user's message buffer.
     */
    public static final int tagMessage(byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0)
            ptr += 48; // skip the "object ..." line.
        while (ptr < sz && b[ptr] != '\n')
            ptr = nextLF(b, ptr);
        if (ptr < sz && b[ptr] == '\n')
            return ptr + 1;
        return -1;
    }

    /**
     * Locate the end of a paragraph.
     * <p>
     * A paragraph is ended by two consecutive LF bytes or CRLF pairs
     *
     * @param b
     *            buffer to scan.
     * @param start
     *            position in buffer to start the scan at. Most callers will
     *            want to pass the first position of the commit message (as
     *            found by {@link #commitMessage(byte[], int)}.
     * @return position of the LF at the end of the paragraph;
     *         <code>b.length</code> if no paragraph end could be located.
     */
    public static final int endOfParagraph(byte[] b, int start) {
        int ptr = start;
        final int sz = b.length;
        while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
            ptr = nextLF(b, ptr);
        if (ptr > start && b[ptr - 1] == '\n')
            ptr--;
        if (ptr > start && b[ptr - 1] == '\r')
            ptr--;
        return ptr;
    }

    /**
     * Get last index of {@code ch} in raw, trimming spaces.
     *
     * @param raw
     *            buffer to scan.
     * @param ch
     *            character to find.
     * @param pos
     *            starting position.
     * @return last index of {@code ch} in raw, trimming spaces.
     * @since 4.1
     */
    public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
        while (pos >= 0 && raw[pos] == ' ')
            pos--;

        while (pos >= 0 && raw[pos] != ch)
            pos--;

        return pos;
    }

    private static Charset charsetForAlias(String name) {
        return encodingAliases.get(StringUtils.toLowerCase(name));
    }

    private RawParseUtils() {
        // Don't create instances of a static only utility.
    }
}