IndexService.Indexer.java Source code

Java tutorial

Introduction

Here is the source code for IndexService.Indexer.java

Source

/**
* Tencent is pleased to support the open source community by making TDW available.
* Copyright (C) 2014 THL A29 Limited, a Tencent company. All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use 
* this file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed 
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
* OF ANY KIND, either express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package IndexService;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import FormatStorage1.IColumnDataFile;
import FormatStorage1.IFormatDataFile;
import FormatStorage1.IRecord;

public class Indexer {
    /** Logger shared by every instance of this class. */
    public static final Log LOG = LogFactory.getLog(Indexer.class);

    /** Hadoop configuration handed to every index and data file opened by this class. */
    private static Configuration conf = new Configuration();
    /**
     * File system resolved from {@link #conf} in the static initializer below; it stays
     * {@code null} when that lookup fails.
     */
    private static FileSystem fs;
    static {
        try {
            fs = FileSystem.get(conf);
        } catch (IOException e) {
            // Class loading stays best-effort as before, but the failure is now reported through
            // the class logger so that a later NullPointerException on fs can be traced back here.
            LOG.error("failed to obtain the FileSystem for the index service", e);
        }
    }

    // Inclusive lower bound of the scan: clones of the start values given to the constructor.
    ArrayList<IRecord.IFValue> valuestart;
    // Inclusive upper bound of the scan: clones of the end values given to the constructor.
    ArrayList<IRecord.IFValue> valueend;
    // Value passed to the constructor; it is stored but not read anywhere else in this file.
    int fieldnum;
    // Row-format data files opened by hasNext(), keyed by data file name; closed in close().
    HashMap<String, IFormatDataFile> openedifdfs = new HashMap<String, IFormatDataFile>();
    // Column-format data files opened by hasNext(), keyed by data file name; closed in close().
    HashMap<String, IColumnDataFile> openedicdfs = new HashMap<String, IColumnDataFile>();
    // Paths of all index files found under the requested partitions.
    ArrayList<String> indexfilesrelated = new ArrayList<String>();
    // Position in indexfilesrelated of the index file being scanned; -1 until the constructor runs.
    int currentindexfile = -1;
    // Index file currently being scanned, or null when none is open.
    IFormatDataFile currentifdf;
    // Data record located by the last successful hasNext().
    IRecord returnrecord = null;
    // True while returnrecord has been fetched by hasNext() but not yet handed out by next().
    boolean recordgot = false;

    // True when the constructor received no indexids, i.e. no field projection was requested.
    boolean returnallfield;
    // Field indexes parsed from the constructor's indexids; stays null when returnallfield is true.
    ArrayList<Integer> idxs = null;
    // Record object obtained from the current index file and reused for reading its entries.
    IRecord indexrec = null;

    /**
     * Creates an iterator over the data records whose index fields lie between
     * {@code startvalue} and {@code endvalue} (both inclusive).
     *
     * @param indexdir   directory that holds one sub-directory of index files per partition
     * @param parts      partition names to search; when {@code null} or empty every entry found
     *                   under {@code indexdir} is used
     * @param startvalue lower bound, one value per leading index field (cloned)
     * @param endvalue   upper bound, same size and field types as {@code startvalue} (cloned)
     * @param indexids   comma separated field indexes to return; {@code null} or empty selects
     *                   all fields
     * @param fieldnum   stored in {@link #fieldnum}
     * @throws IOException if the bounds differ in size or type, or an index file cannot be used
     */
    public Indexer(String indexdir, List<String> parts, List<IRecord.IFValue> startvalue,
            List<IRecord.IFValue> endvalue, String indexids, int fieldnum) throws IOException {
        this.fieldnum = fieldnum;

        // Field projection: absent or empty ids mean "return every field".
        returnallfield = (indexids == null || indexids.length() <= 0);
        if (!returnallfield) {
            idxs = new ArrayList<Integer>();
            for (String token : indexids.split(",")) {
                idxs.add(Integer.parseInt(token.trim()));
            }
        }

        // Work on private copies so later changes by the caller cannot move the bounds.
        valuestart = new ArrayList<IRecord.IFValue>();
        for (IRecord.IFValue lower : startvalue) {
            valuestart.add((IRecord.IFValue) lower.clone());
        }
        valueend = new ArrayList<IRecord.IFValue>();
        for (IRecord.IFValue upper : endvalue) {
            valueend.add((IRecord.IFValue) upper.clone());
        }

        final int bounds = valuestart.size();
        if (bounds != valueend.size()) {
            throw new IOException("the startvalue and the endvalue must have the same size");
        }
        for (int pos = 0; pos < bounds; pos++) {
            if (valuestart.get(pos).type().type() != valueend.get(pos).type().type()) {
                throw new IOException(
                        "the start value and the end value must have the same type in every fieldvalue");
            }
        }

        // Collect the index files of every requested partition that exists.
        indexfilesrelated.clear();
        String basedir = indexdir.endsWith("/") ? indexdir : (indexdir + "/");
        List<String> partnames = parts;
        if (partnames == null || partnames.size() <= 0) {
            partnames = getIndexPartName(basedir);
        }
        for (String partname : partnames) {
            Path partpath = new Path(basedir + partname);
            if (!fs.exists(partpath)) {
                continue;
            }
            for (FileStatus indexfile : fs.listStatus(partpath)) {
                indexfilesrelated.add(indexfile.getPath().toString());
            }
        }

        // Position on the first index file that can contain the lower bound, if any.
        currentindexfile = 0;
        boolean positioned = initializeifdf();
        if (!positioned) {
            currentifdf = null;
        }
    }

    /**
     * Opens index files, starting at {@link #currentindexfile}, until one is found in which a
     * seek to the first start value succeeds.
     *
     * <p>For every candidate file the field layout is validated against the start values: the
     * file must have at least {@code valuestart.size() + 2} fields and the leading field types
     * must match. A candidate that fails validation or the seek is closed before moving on or
     * before the exception propagates, and {@link #currentifdf} is reset to {@code null} so a
     * closed file is never left behind for {@link #close()} to close a second time.
     *
     * @return {@code true} when {@link #currentifdf} is open and positioned; {@code false} when
     *         no remaining index file qualifies (then {@link #currentifdf} is {@code null})
     * @throws IOException if an index file cannot be read or its layout does not fit the bounds
     */
    private boolean initializeifdf() throws IOException {
        for (; currentindexfile < indexfilesrelated.size(); currentindexfile++) {
            currentifdf = new IFormatDataFile(conf);
            currentifdf.open(indexfilesrelated.get(currentindexfile));
            boolean positioned = false;
            try {
                indexrec = currentifdf.getIRecordObj();
                if (currentifdf.fileInfo().head().fieldMap().fieldtypes().size() < valuestart.size() + 2) {
                    throw new IOException("input value field size is more than index can support");
                }
                for (int i = 0; i < valuestart.size(); i++) {
                    if (currentifdf.fileInfo().head().fieldMap().fieldtypes().get(i).type() != valuestart.get(i)
                            .type().type()) {
                        LOG.info("index type:\t"
                                + currentifdf.fileInfo().head().fieldMap().fieldtypes().get(i).type());
                        LOG.info("input type:\t" + valuestart.get(i).type().type());
                        throw new IOException("input value field type is not fit the index field type");
                    }
                }
                positioned = currentifdf.seek(valuestart.get(0));
            } finally {
                if (!positioned) {
                    // Release the rejected file on every exit path, including the exceptions above.
                    IFormatDataFile rejected = currentifdf;
                    currentifdf = null;
                    rejected.close();
                }
            }
            if (positioned) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates an iterator for an exact-match lookup by delegating to the range constructor with
     * {@code values} used as both the start and the end bound.
     *
     * @throws IOException propagated from the range constructor
     */
    public Indexer(String indexdir, List<String> parts, List<IRecord.IFValue> values, String indexids, int fieldnum)
            throws IOException {
        this(indexdir, parts, values, values, indexids, fieldnum);
    }

    /**
     * Advances to the next index entry inside the requested range and loads the data record it
     * points to into {@link #returnrecord}.
     *
     * <p>Repeated calls without an intervening {@link #next()} do not advance. When the current
     * index file is exhausted, the following index files are tried in order.
     *
     * @return {@code true} if a record is ready to be returned by {@link #next()}
     * @throws Exception if an index or data file cannot be read
     */
    public boolean hasNext() throws Exception {
        if (recordgot) {
            return true;
        }

        while (true) {
            if (currentindexfile >= indexfilesrelated.size()) {
                return false;
            }
            if (currentifdf != null && getirecord(currentifdf, indexrec)) {
                break;
            }
            // Current index file has no further match: release it and move to the next one.
            if (currentifdf != null) {
                currentifdf.close();
            }
            currentifdf = null;
            currentindexfile++;
            if (!initializeifdf()) {
                // Same as the constructor: never keep a reference to an already closed index
                // file, otherwise close() would close it a second time.
                currentifdf = null;
                return false;
            }
            indexrec = currentifdf.getIRecordObj();
        }

        // The last two fields of an index entry are read as the data file id and the line number.
        int fileindex = (Short) indexrec.getByIdx(indexrec.fieldnum() - 2).data();
        int line = (Integer) indexrec.getByIdx(indexrec.fieldnum() - 1).data();
        HashMap<Integer, String> infos = currentifdf.fileInfo().head().getUdi().infos();
        String datafile = infos.get(fileindex);
        // Entry 123456 marks the storage layout; a missing entry is treated as "not column"
        // instead of failing with a NullPointerException.
        boolean column = "column".equals(infos.get(123456));
        if (!column) {
            if (!openedifdfs.containsKey(datafile)) {
                // NOTE(review): this path always opens every field, while the column path below
                // and getRecord() honour the requested field indexes - confirm this is intended.
                IFormatDataFile ifdf = new IFormatDataFile(conf);
                ifdf.open(datafile);
                openedifdfs.put(datafile, ifdf);
            }
            returnrecord = openedifdfs.get(datafile).getByLine(line);
        } else {
            if (!openedicdfs.containsKey(datafile)) {
                IColumnDataFile icdf = new IColumnDataFile(conf);
                if (returnallfield) {
                    icdf.open(datafile);
                } else {
                    icdf.open(datafile, idxs);
                }
                openedicdfs.put(datafile, icdf);
            }
            returnrecord = openedicdfs.get(datafile).getByLine(line);
        }
        recordgot = true;
        return true;
    }

    /**
     * Reads index entries from {@code ifdf} into {@code irec} until one falls inside the
     * requested range.
     *
     * @return {@code true} when {@code irec} holds an in-range entry; {@code false} when the
     *         file is exhausted or an entry beyond the upper bound was read
     * @throws IOException if reading the index file fails
     */
    private boolean getirecord(IFormatDataFile ifdf, IRecord irec) throws IOException {
        for (;;) {
            if (!ifdf.next(irec)) {
                return false;
            }
            switch (check(irec)) {
            case 0:
                return true;
            case 1:
                return false;
            default:
                // Entry is not yet inside the range: keep reading.
                break;
            }
        }
    }

    /**
     * Classifies an index entry against the range, comparing field by field in order.
     *
     * @return {@code -1} when the entry sorts before {@link #valuestart} or has fewer than
     *         {@code valueend.size() + 2} fields, {@code 1} when it sorts after
     *         {@link #valueend}, and {@code 0} when it lies inside the range (bounds included)
     */
    private int check(IRecord irec) {
        if (irec.fieldnum() < valueend.size() + 2) {
            return -1;
        }

        // Lower bound: the first differing field decides.
        int pos = 0;
        while (pos < valuestart.size()) {
            int againstStart = irec.getByIdx(pos).compareTo(valuestart.get(pos));
            if (againstStart < 0) {
                return -1;
            }
            if (againstStart > 0) {
                break;
            }
            pos++;
        }

        // Upper bound: again the first differing field decides.
        pos = 0;
        while (pos < valueend.size()) {
            int againstEnd = irec.getByIdx(pos).compareTo(valueend.get(pos));
            if (againstEnd > 0) {
                return 1;
            }
            if (againstEnd < 0) {
                break;
            }
            pos++;
        }
        return 0;
    }

    /**
     * Returns the next matching data record, fetching it first when {@link #hasNext()} has not
     * been called since the previous record was handed out.
     *
     * <p>The pending flag is cleared on every successful return. Previously it stayed set when
     * this method had to call {@link #hasNext()} itself, so the following call returned the
     * same record a second time.
     *
     * @return the next record, or {@code null} when no further record matches
     * @throws Exception if an index or data file cannot be read
     */
    public IRecord next() throws Exception {
        if (!hasNext()) {
            return null;
        }
        recordgot = false;
        return returnrecord;
    }

    /**
     * Closes the current index file and every data file opened by {@link #hasNext()}.
     *
     * <p>Each file is closed independently: a failure is logged and the remaining files are
     * still closed, so one bad handle no longer leaks all the others. The caches are emptied
     * afterwards, which makes a second call a no-op. This method never throws.
     */
    public void close() {
        if (currentifdf != null) {
            try {
                currentifdf.close();
            } catch (Exception e) {
                LOG.warn("failed to close the current index file", e);
            }
            currentifdf = null;
        }
        if (openedifdfs != null) {
            for (IFormatDataFile ifdf : openedifdfs.values()) {
                if (ifdf == null) {
                    continue;
                }
                try {
                    ifdf.close();
                } catch (Exception e) {
                    LOG.warn("failed to close a format data file", e);
                }
            }
            openedifdfs.clear();
        }
        if (openedicdfs != null) {
            for (IColumnDataFile icdf : openedicdfs.values()) {
                if (icdf == null) {
                    continue;
                }
                try {
                    icdf.close();
                } catch (Exception e) {
                    LOG.warn("failed to close a column data file", e);
                }
            }
            openedicdfs.clear();
        }
    }

    /**
     * Creates an indexer without opening any index file. The query methods of this class that
     * take the index directory as an argument (for example getRange and recordsnum) can be
     * called on such an instance.
     */
    public Indexer() {
    }

    /**
     * Counts the index entries that match {@code startvalue} exactly, by delegating to the
     * range variant with the same list as both start and end bound.
     *
     * @throws IOException propagated from the range variant
     */
    public int recordsnum(String indexdir, List<String> parts, List<IRecord.IFValue> startvalue)
            throws IOException {
        return recordsnum(indexdir, parts, startvalue, startvalue);
    }

    /**
     * Counts the index entries whose leading fields lie between {@code startvalue} and
     * {@code endvalue} (both inclusive) across the given partitions.
     *
     * @param indexdir   directory that holds one sub-directory of index files per partition
     * @param parts      partition names to search; when {@code null} or empty every entry found
     *                   under {@code indexdir} is used
     * @param startvalue lower bound, one value per leading index field
     * @param endvalue   upper bound, same size and field types as {@code startvalue}
     * @return number of matching index entries; partitions that do not exist contribute 0
     * @throws IOException if the bounds differ in size or type, or an index file cannot be read
     */
    public int recordsnum(String indexdir, List<String> parts, List<IRecord.IFValue> startvalue,
            List<IRecord.IFValue> endvalue) throws IOException {
        int num = 0;
        if (parts == null || parts.size() <= 0) {
            parts = getIndexPartName(indexdir);
        }
        indexdir = indexdir.endsWith("/") ? indexdir : (indexdir + "/");
        List<IRecord.IFValue> valstart = new ArrayList<IRecord.IFValue>();
        for (IRecord.IFValue fv : startvalue) {
            valstart.add(fv.clone());
        }
        List<IRecord.IFValue> valend = new ArrayList<IRecord.IFValue>();
        for (IRecord.IFValue fv : endvalue) {
            valend.add(fv.clone());
        }

        if (valstart.size() != valend.size()) {
            throw new IOException("the startvalue and the endvalue must have the same size");
        }

        for (int i = 0; i < valstart.size(); i++) {
            if (valstart.get(i).type().type() != valend.get(i).type().type()) {
                throw new IOException(
                        "the start value and the end value must have the same type in every fieldvalue");
            }
        }

        for (String partname : parts) {
            Path partfile = new Path(indexdir + partname);
            // Skip partitions without an index directory, as the iterator constructor does.
            if (!fs.exists(partfile)) {
                continue;
            }
            FileStatus[] indexfiles = fs.listStatus(partfile);
            for (FileStatus indexfile : indexfiles) {
                IFormatDataFile ifdf = new IFormatDataFile(conf);
                ifdf.open(indexfile.getPath().toString());
                try {
                    num += getIndexResRange(ifdf, valstart, valend, -1).size();
                } finally {
                    // Release the index file even when the scan throws.
                    ifdf.close();
                }
            }
        }
        return num;
    }

    /**
     * Fetches the records whose index fields match {@code values} exactly, by delegating to
     * {@code getRange} with the same list as both start and end bound.
     *
     * @throws Exception propagated from {@code getRange}
     */
    public List<IRecord> get(String indexdir, List<String> parts, List<IRecord.IFValue> values, int limit,
            String indexids, int fieldnum) throws Exception {
        return getRange(indexdir, parts, values, values, limit, indexids, fieldnum);
    }

    /**
     * Lists the names of the entries directly under {@code indexdir}; each one is treated by
     * the callers as a partition of the index.
     *
     * @return the entry names, or an empty list when {@code indexdir} does not exist
     * @throws IOException if the file system cannot be queried
     */
    private List<String> getIndexPartName(String indexdir) throws IOException {
        List<String> partnames = new ArrayList<String>();
        Path root = new Path(indexdir);
        if (!fs.exists(root)) {
            return partnames;
        }
        for (FileStatus child : fs.listStatus(root)) {
            partnames.add(child.getPath().getName());
        }
        return partnames;
    }

    /**
     * Loads the data records addressed by an index lookup result.
     *
     * @param indexresult line numbers to fetch, grouped by data file name
     * @param iscolumn    for every data file name, whether it is opened as a column data file
     * @param indexids    comma separated field indexes to read; {@code null} or empty reads all
     *                    fields (empty is accepted here as it is by the iterator constructor)
     * @param limit       maximum number of records to return; a negative value means no limit
     * @param fieldnum    not used by this method
     * @return the records in data file name order, then line order
     * @throws IOException if a data file cannot be read; files opened so far are closed
     */
    private List<IRecord> getRecord(TreeMap<String, TreeSet<Integer>> indexresult,
            HashMap<String, Boolean> iscolumn, String indexids, int limit, int fieldnum) throws IOException {
        long time = System.currentTimeMillis();
        List<IRecord> result = new ArrayList<IRecord>();
        boolean returnallfields = indexids == null || indexids.length() <= 0;
        ArrayList<Integer> fieldidxs = new ArrayList<Integer>();
        if (!returnallfields) {
            for (String idxid : indexids.split(",")) {
                fieldidxs.add(Integer.parseInt(idxid.trim()));
            }
        }
        label: for (String file : indexresult.keySet()) {
            if (!iscolumn.get(file)) {
                IFormatDataFile ifdf = new IFormatDataFile(conf);
                if (returnallfields) {
                    ifdf.open(file);
                } else {
                    ifdf.open(file, fieldidxs);
                }
                try {
                    for (Integer line : indexresult.get(file)) {
                        if (limit >= 0 && result.size() >= limit) {
                            break label;
                        }
                        result.add(ifdf.getByLine(line));
                    }
                } finally {
                    // Runs on normal completion, on "break label" and on exceptions alike.
                    ifdf.close();
                }
            } else {
                IColumnDataFile icdf = new IColumnDataFile(conf);
                if (returnallfields) {
                    icdf.open(file);
                } else {
                    icdf.open(file, fieldidxs);
                }
                try {
                    for (Integer line : indexresult.get(file)) {
                        if (limit >= 0 && result.size() >= limit) {
                            break label;
                        }
                        result.add(icdf.getByLine(line));
                    }
                } finally {
                    icdf.close();
                }
            }
        }
        System.out.println("getRecord time:\t" + (System.currentTimeMillis() - time) / 1000 + "s");
        return result;
    }

    /**
     * Fetches the data records whose index fields lie between {@code startvalue} and
     * {@code endvalue}, searching the partitions one after another.
     *
     * @param indexdir   directory that holds one sub-directory of index files per partition
     * @param parts      partition names to search; when {@code null} or empty every entry found
     *                   under {@code indexdir} is used
     * @param startvalue lower bound, one value per leading index field (cloned)
     * @param endvalue   upper bound, same size and field types as {@code startvalue} (cloned)
     * @param limit      when positive, the search stops once this many records were collected
     * @param indexids   comma separated field indexes to return, passed on unchanged
     * @param fieldnum   passed on unchanged
     * @return the collected records
     * @throws IOException if the bounds differ in size or type, or a file cannot be read
     */
    public List<IRecord> getRange(String indexdir, List<String> parts, List<IRecord.IFValue> startvalue,
            List<IRecord.IFValue> endvalue, int limit, String indexids, int fieldnum) throws IOException {

        // Private copies of both bounds.
        ArrayList<IRecord.IFValue> lower = new ArrayList<IRecord.IFValue>();
        for (IRecord.IFValue value : startvalue) {
            lower.add(value.clone());
        }
        ArrayList<IRecord.IFValue> upper = new ArrayList<IRecord.IFValue>();
        for (IRecord.IFValue value : endvalue) {
            upper.add(value.clone());
        }

        final int bounds = lower.size();
        if (bounds != upper.size()) {
            throw new IOException("the startvalue and the endvalue must have the same size");
        }
        for (int pos = 0; pos < bounds; pos++) {
            if (lower.get(pos).type().type() != upper.get(pos).type().type()) {
                throw new IOException(
                        "the start value and the end value must have the same type in every fieldvalue");
            }
        }

        List<String> partnames = parts;
        if (partnames == null || partnames.size() <= 0) {
            partnames = getIndexPartName(indexdir);
        }

        List<IRecord> collected = new ArrayList<IRecord>();
        for (String partname : partnames) {
            // Each partition only gets what is left of the overall limit.
            int remaining = limit - collected.size();
            if (limit > 0 && remaining <= 0) {
                break;
            }
            collected.addAll(getRange1(indexdir, partname, lower, upper, indexids, remaining, fieldnum));
        }
        return collected;
    }

    /**
     * Runs a range lookup inside one partition: scans every index file of the partition,
     * groups the matching line numbers by data file and then loads the records.
     *
     * <p>Each index file is closed even when validation or the scan fails. A partition without
     * an index directory yields no records, as in the iterator constructor. A missing layout
     * marker (entry 123456) is treated as "not column" instead of failing with a
     * {@code NullPointerException}.
     *
     * @param indexdir   directory that holds the partition directories
     * @param partname   name of the partition directory to search
     * @param startvalue lower bound, one value per leading index field
     * @param endvalue   upper bound
     * @param indexids   comma separated field indexes to return, passed on to {@code getRecord}
     * @param limit      passed on to the index scan and to {@code getRecord}
     * @param fieldnum   passed on to {@code getRecord}
     * @return the records found in this partition
     * @throws IOException if an index file does not fit the bounds or a file cannot be read
     */
    private List<IRecord> getRange1(String indexdir, String partname, List<IRecord.IFValue> startvalue,
            List<IRecord.IFValue> endvalue, String indexids, int limit, int fieldnum) throws IOException {
        String dir = indexdir.endsWith("/") ? indexdir : (indexdir + "/");
        Path partfile = new Path(dir + partname);

        TreeMap<String, TreeSet<Integer>> indexresult = new TreeMap<String, TreeSet<Integer>>();
        HashMap<String, Boolean> iscolumn = new HashMap<String, Boolean>();
        FileStatus[] fss = fs.exists(partfile) ? fs.listStatus(partfile) : new FileStatus[0];
        long time = System.currentTimeMillis();
        for (FileStatus status : fss) {
            IFormatDataFile ifdf = new IFormatDataFile(conf);
            ifdf.open(status.getPath().toString());
            try {
                if (ifdf.fileInfo().head().fieldMap().fieldtypes().size() < startvalue.size() + 2) {
                    throw new IOException("input value field size is more than index can support");
                }
                for (int i = 0; i < startvalue.size(); i++) {
                    if (ifdf.fileInfo().head().fieldMap().fieldtypes().get(i).type() != startvalue.get(i).type()
                            .type()) {
                        throw new IOException("input value field type is not fit the index field type");
                    }
                }

                HashMap<Integer, String> infos = ifdf.fileInfo().head().getUdi().infos();
                boolean column = "column".equals(infos.get(123456));
                List<IndexValue> res = getIndexResRange(ifdf, startvalue, endvalue, limit);
                for (IndexValue iv : res) {
                    int fileid = iv.getFileindex();
                    String filename = infos.get(fileid);
                    if (!indexresult.containsKey(filename)) {
                        indexresult.put(filename, new TreeSet<Integer>());
                        iscolumn.put(filename, column);
                    }
                    indexresult.get(filename).add(iv.getRowid());
                }
            } finally {
                // Release the index file on success and on every failure path.
                ifdf.close();
            }
        }
        System.out.println("getIndexResRange time:\t" + (System.currentTimeMillis() - time) / 1000 + "s");
        System.out.println("related file num:\t" + indexresult.size());
        return getRecord(indexresult, iscolumn, indexids, limit, fieldnum);
    }

    /**
     * Scans one opened index file for the entries inside a range.
     *
     * <p>The file is positioned with a seek to the first start value and then read entry by
     * entry: entries before the lower bound are skipped, entries inside the range are collected
     * and the scan stops at the first entry beyond the upper bound, at the end of the file, or
     * once {@code limit} entries were collected (only when {@code limit} is positive).
     *
     * @param ifdf     opened index file
     * @param valstart lower bound; {@code null} or empty yields an empty result
     * @param valend   upper bound
     * @param limit    maximum number of entries to collect; zero or negative means no limit
     * @return the (file index, line) pairs taken from the last two fields of each match
     * @throws IOException if reading the index file fails
     */
    private List<IndexValue> getIndexResRange(IFormatDataFile ifdf, List<IRecord.IFValue> valstart,
            List<IRecord.IFValue> valend, int limit) throws IOException {
        List<IndexValue> matches = new ArrayList<IndexValue>();
        if (valstart == null || valstart.size() <= 0) {
            return matches;
        }
        if (!ifdf.seek(valstart.get(0))) {
            return matches;
        }

        final boolean bounded = limit > 0;
        while (!(bounded && matches.size() >= limit)) {
            IRecord entry = ifdf.getIRecordObj();
            if (!ifdf.next(entry)) {
                break;
            }

            // Compare with the lower bound; the first differing field decides.
            int order = 0;
            for (int pos = 0; pos < valstart.size(); pos++) {
                order = entry.getByIdx(pos).compareTo(valstart.get(pos));
                if (order != 0) {
                    break;
                }
            }
            if (order < 0) {
                continue;
            }

            // Compare with the upper bound; the first differing field decides.
            for (int pos = 0; pos < valend.size(); pos++) {
                order = entry.getByIdx(pos).compareTo(valend.get(pos));
                if (order != 0) {
                    break;
                }
            }
            if (order > 0) {
                break;
            }

            int fileindex = (Short) entry.getByIdx(entry.fieldnum() - 2).data();
            int line = (Integer) entry.getByIdx(entry.fieldnum() - 1).data();
            matches.add(new IndexValue(fileindex, line));
        }
        return matches;
    }

    /**
     * Manual smoke check: runs an unlimited range query on field 2 against a hard-coded index
     * directory, requesting fields 0, 2, 3 and 4, and shows every returned record.
     *
     * @throws IOException if the query fails
     */
    static void test() throws IOException {
        String indexdir = "/se/index/indextest1/testformat";
        ArrayList<IRecord.IFValue> values = new ArrayList<IRecord.IFValue>();
        values.add(new IRecord.IFValue(100, 2));
        ArrayList<IRecord.IFValue> values1 = new ArrayList<IRecord.IFValue>();
        values1.add(new IRecord.IFValue(10333443, 2));

        Indexer indexer = new Indexer();
        List<IRecord> recs = indexer.getRange(indexdir, null, values, values1, -1, "0,2,3,4", -1);
        for (IRecord rec : recs) {
            rec.show();
        }

    }

    /**
     * Command line entry point: reads one line of the form
     * {@code indexdir idx type startvalue [endvalue]} from standard input, runs a range query
     * limited to 100 records over all partitions and shows every returned record.
     *
     * <p>When {@code endvalue} is omitted the start value is used for both bounds. Missing
     * input, too few tokens or an unsupported type are reported on standard error instead of
     * failing with an exception or silently running an empty query.
     *
     * @param args not used
     * @throws IOException if reading the input or running the query fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println("input cmd:   indexdir  idx  type  startvalue  endvalue");
        String str = new BufferedReader(new InputStreamReader(System.in)).readLine();
        if (str == null) {
            System.err.println("no input given");
            return;
        }
        StringTokenizer st = new StringTokenizer(str);
        if (st.countTokens() < 4) {
            System.err.println("expected:   indexdir  idx  type  startvalue  [endvalue]");
            return;
        }
        String indexdir = st.nextToken();
        int idx = Integer.parseInt(st.nextToken());
        String type = st.nextToken();
        String startvalue = st.nextToken();
        String endvalue = st.hasMoreTokens() ? st.nextToken() : startvalue;

        IRecord.IFValue lower = parseValue(type, startvalue, idx);
        IRecord.IFValue upper = parseValue(type, endvalue, idx);
        if (lower == null || upper == null) {
            System.err.println("unsupported type:\t" + type);
            return;
        }
        ArrayList<IRecord.IFValue> values = new ArrayList<IRecord.IFValue>();
        values.add(lower);
        ArrayList<IRecord.IFValue> values1 = new ArrayList<IRecord.IFValue>();
        values1.add(upper);

        Indexer indexer = new Indexer();
        List<IRecord> recs = indexer.getRange(indexdir, null, values, values1, 100, null, -1);
        for (IRecord rec : recs) {
            rec.show();
        }

    }

    /**
     * Builds the field value for {@code text} according to the type name given on the command
     * line (byte, short, int, long, float, double or string, case-insensitive).
     *
     * @return the value for field {@code idx}, or {@code null} when the type name is unknown
     */
    private static IRecord.IFValue parseValue(String type, String text, int idx) throws IOException {
        if (type.equalsIgnoreCase("byte")) {
            return new IRecord.IFValue(Byte.parseByte(text), idx);
        }
        if (type.equalsIgnoreCase("short")) {
            return new IRecord.IFValue(Short.parseShort(text), idx);
        }
        if (type.equalsIgnoreCase("int")) {
            return new IRecord.IFValue(Integer.parseInt(text), idx);
        }
        if (type.equalsIgnoreCase("long")) {
            return new IRecord.IFValue(Long.parseLong(text), idx);
        }
        if (type.equalsIgnoreCase("float")) {
            return new IRecord.IFValue(Float.parseFloat(text), idx);
        }
        if (type.equalsIgnoreCase("double")) {
            return new IRecord.IFValue(Double.parseDouble(text), idx);
        }
        if (type.equalsIgnoreCase("string")) {
            return new IRecord.IFValue(text, idx);
        }
        return null;
    }
}