data_gen.Data_gen.java Source code

Java tutorial

Introduction

Here is the source code for data_gen.Data_gen.java

Source

/**
 *
 * @author jhilanalkarawi
 *
 *
 * Copyright 2014 Brainspace Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * This file uses JACKSON JSON Library licensed under the apache license
 * http://jackson.codehaus.org/
 *
 *
 * This is the main class for generating a dataset. The dataset specifications
 * and the document fields are parsed from the configuration file into a map;
 * the map is then populated with values according to the type of each field,
 * and each generated document is written to the .json or .dat output file.
 *
 */
package data_gen;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Scanner;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

/**
 * Command-line generator of synthetic document data sets.
 *
 * <p>The program takes two arguments: a directory containing {@code .txt}
 * source files and a {@code .properties} configuration file. The configuration
 * file supplies four settings ({@code output_name}, {@code documents_count},
 * {@code dataset_size}, {@code output_dir}) and any number of field definitions
 * written as {@code "fieldName"=type}. Recognised types are:
 * {@code integer.key<start>}, {@code seq.integer<start>},
 * {@code range:[min,max]}, {@code [a,b,c]} (single-value enumeration),
 * {@code multi:[a,b,c]}, {@code date:[from,to]}, {@code text}, {@code text.key}
 * and {@code (constant)}. A trailing {@code *} on a type is ignored when the
 * value is generated.
 *
 * <p>Depending on the extension of {@code output_name} the documents are
 * written either as one JSON object ({@code {"docs":[...]}}) or as a delimited
 * {@code .dat} file; {@code schema.xml} and {@code fieldMap.xml} are written
 * next to it.
 *
 * <p>All state is kept in public static fields, so the class is neither
 * re-entrant nor thread-safe.
 */
public class Data_gen {

    /** Output file name ({@code output_name} setting); its extension selects JSON or DAT output. */
    public static String Default_DataSet_name;
    /** Number of documents to generate ({@code documents_count} setting). */
    public static long documents_count;
    /** Target combined size of all generated text ({@code dataset_size} setting). */
    public static long Docments_Total_size;
    /** Directory receiving the data set, schema.xml and fieldMap.xml ({@code output_dir} setting). */
    public static String output_dir;
    /** Running total of generated text length, counted in characters after trimming. */
    public static long total = 0;
    /** Average text size per document: {@code Docments_Total_size / documents_count}. */
    public static long document_size = 0;
    /** Next value handed out to an {@code integer.key} field; shared by all such fields. */
    public static long x = 0;
    /** Next value handed out to a {@code seq.integer} field; shared by all such fields. */
    public static long x2 = 0;
    /** Number of documents written so far. */
    public static long cnt = 0;
    /** Document count at which the next progress report is printed. */
    public static long count_check = 1000;
    /** Index into {@link #listOfFiles} of the text file most recently loaded into {@link #tx}. */
    public static int file_index = 0;
    /** The document currently being generated (emptied again once it has been written). */
    public static HashMap<String, Object> fields = new LinkedHashMap<>();
    /** Field definitions as parsed from the configuration file; input of the schema generator. */
    public static HashMap<String, Object> schema_fields = new LinkedHashMap<>();
    /**
     * Queue of source text lines that have not been consumed yet. A linked list
     * is used because lines are always removed from the head.
     */
    public static List<String> tx = new LinkedList<>();
    /** The {@code .txt} files found in the text source directory. */
    public static File[] listOfFiles;
    /** Upper bound for the text length of one document (1.8 x {@link #document_size}). */
    public static long max = 0;
    /** Lower bound for the text length of one document (0.2 x {@link #document_size}). */
    public static long min = 0;

    /** Shared random source; avoids creating a new generator for every generated value. */
    private static final Random RANDOM = new Random();
    /** Character wrapped around every cell of a .dat row. */
    private static final char DAT_QUOTE = '\u00FE';
    /** Character separating the cells of a .dat row. */
    private static final char DAT_DELIMITER = '\u0014';
    /** Character appended after every source line inside a .dat text cell. */
    private static final char DAT_LINE_BREAK = (char) 174;
    /**
     * Whether {@link #x2} has been initialised from the configuration. A flag is
     * needed because 0 is a legitimate counter value when the start is negative.
     */
    private static boolean seqIntegerStarted = false;

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the text source directory, {@code args[1]}
     *             the path of the {@code .properties} configuration file
     * @throws FileNotFoundException if an output file cannot be created
     * @throws IOException           if reading a source file or writing the data set fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        long startTime = System.nanoTime();
        if (args.length < 2) {
            System.out.println("Usage:");
            System.out.println(
                    "java -jar \"jarfile\" [Directory of text source folder] [Dierctory of configration file]"
                            + "\n");
            System.exit(0);
        }

        String Dir = args[0]; // text source directory
        String config_dir = args[1];
        File folder = new File(Dir);
        if (!folder.isDirectory()) {
            System.out.println("Text souce folder is not a Directory." + "\n");
            System.exit(0);
        }
        if (!config_dir.endsWith(".properties") && !config_dir.endsWith(".PROPERTIES")) {
            System.out.println("\n"
                    + "There was error parsing dataset parameters from configuration file, make sure you have the 4 parameters specified and the right type of file"
                    + "\n");
            System.exit(0);
        }

        listOfFiles = folder.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".txt");
            }
        });

        // listFiles() returns null when the directory cannot be read.
        if (listOfFiles == null || listOfFiles.length == 0) {
            System.out.println("Text source folder is empty ! Have at least one .txt file there" + "\n");
            System.exit(0);
        }

        System.out.println("\n");
        Parse_Document_values(config_dir); // fills the four data set settings
        document_size = Docments_Total_size / documents_count;
        max = (long) ((double) document_size * 1.8);
        min = (long) ((double) document_size * 0.2);

        schema_fields = Parse_Document_fields(config_dir);

        try {
            load_lines(listOfFiles[0], tx);
        } catch (NullPointerException | FileNotFoundException e) {
            System.out.println("The text source file could not be found." + "\n");
            System.exit(0);
        }

        // mkdirs() also creates missing parent directories of a nested output path.
        new File(output_dir).mkdirs();

        if (Default_DataSet_name.endsWith(".json")) {
            Build_json_file(config_dir, startTime);
        } else if (Default_DataSet_name.endsWith(".dat")) {
            Build_dat_file(config_dir, startTime);
        } else {
            System.out.println("No data set was written: output_name must end with .json or .dat" + "\n");
        }

        generate_xml();
        generate_field_map();
    }

    /**
     * Reads the field definitions (lines of the form {@code "name"=type}) from
     * the configuration file, preserving their order.
     *
     * <p>Only lines that start with a double quote are considered. The value is
     * everything after the first {@code =}. Read errors are reported on standard
     * output and whatever was parsed up to that point is returned.
     *
     * @param config path of the configuration file
     * @return field name to type specification, in file order; never {@code null}
     * @throws FileNotFoundException declared for backward compatibility
     * @throws IOException           declared for backward compatibility
     */
    public static LinkedHashMap<String, Object> Parse_Document_fields(String config)
            throws FileNotFoundException, IOException {
        LinkedHashMap<String, Object> parsed = new LinkedHashMap<>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(config)))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (!line.startsWith("\"")) {
                    continue;
                }
                String[] parts = line.split("=", 2);
                if (parts.length < 2) {
                    continue; // no '=' on this line: not a field definition
                }
                // Trim first so that the closing quote is found even when
                // whitespace separates it from the '=' sign.
                String name = parts[0].trim().substring(1);
                if (name.endsWith("\"")) {
                    name = name.substring(0, name.length() - 1);
                }
                parsed.put(name.trim(), parts[1].trim());
            }
        } catch (FileNotFoundException e) {
            System.out.println("The file could not be found.");
        } catch (IOException e) {
            System.out.println("There was an error reading the file.");
        }
        return parsed;
    }

    /**
     * Reads the four data set settings ({@code output_name},
     * {@code documents_count}, {@code dataset_size}, {@code output_dir}) from
     * the configuration file into the corresponding static fields.
     *
     * <p>Prints a message and terminates the JVM when the file cannot be read,
     * a number cannot be parsed, or a setting is missing or not positive.
     *
     * @param config path of the configuration file
     * @throws FileNotFoundException declared for backward compatibility
     * @throws IOException           declared for backward compatibility
     */
    public static void Parse_Document_values(String config) throws FileNotFoundException, IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(new File(config)))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] parts = line.split("=", 2);
                if (parts.length < 2) {
                    continue; // not a name=value line
                }
                String name = parts[0].trim();
                String setting = parts[1].trim();

                if (name.equalsIgnoreCase("output_name")) {
                    Default_DataSet_name = setting;
                } else if (name.equalsIgnoreCase("documents_count")) {
                    documents_count = Long.parseLong(setting);
                } else if (name.equalsIgnoreCase("dataset_size")) {
                    Docments_Total_size = Long.parseLong(setting);
                } else if (name.equalsIgnoreCase("output_dir")) {
                    output_dir = setting;
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println("The configration file could not be found.");
            System.exit(0);
        } catch (IOException | NumberFormatException e) {
            System.out.println("There was an error reading the configration file.");
            System.exit(0);
        }

        if (Default_DataSet_name == null || documents_count <= 0 || Docments_Total_size <= 0
                || output_dir == null) {
            System.out.println("There was an error reading the configration file\n"
                    + "make sure you have the following parametes specified EX:\n" + "documents_count=25000\n"
                    + "dataset_size=10000000\n" + "output_name=output.json\n" + "output_dir=./dataset_one\n");
            System.exit(0);
        }
    }

    /**
     * Prompts on standard output and reads the text source directory from
     * standard input.
     *
     * @return the entered line, trimmed
     */
    public static String get_text_Dir() {
        // The scanner is deliberately not closed: closing it would close System.in.
        Scanner s = new Scanner(System.in);
        System.out.println("Enter the data source Directory: " + "\n");
        return s.nextLine().trim();
    }

    /**
     * Prompts on standard output and reads the configuration file location from
     * standard input.
     *
     * @return the entered line, trimmed
     */
    public static String get_config_Dir() {
        // The scanner is deliberately not closed: closing it would close System.in.
        Scanner s = new Scanner(System.in);
        System.out.println("\n" + "Enter the name or Directory of the configration file to be used: " + "\n");
        return s.nextLine().trim();
    }

    /**
     * Stores the next {@code integer.key} value under {@code key}. The counter
     * {@link #x} is initialised from the {@code <start>} part of the
     * specification (absolute value) while it is still 0.
     */
    private static void integer_key(Map<String, Object> fields, String key, String value) {
        if (x == 0) {
            try {
                String[] parts = value.split("<|>");
                x = Math.abs((long) Integer.parseInt(parts[1].trim()));
            } catch (NumberFormatException | NullPointerException | ArrayIndexOutOfBoundsException e) {
                System.out.println(
                        "Integer.key start value is incorrect. make sure it's positive integer between <> signs in configuration file.");
                System.exit(0);
            }
        }
        if (value.startsWith("integer.key")) {
            fields.put(key, String.valueOf(x++));
        }
    }

    /**
     * Stores the next {@code seq.integer} value under {@code key}. The counter
     * {@link #x2} is initialised once from the {@code <start>} part of the
     * specification, which may be negative.
     */
    private static void seq_integer(Map<String, Object> fields, String key, String value) {
        // Also honour the historical "x2 == 0 means unset" convention as long as
        // the sequence has not been started by this method.
        if (!seqIntegerStarted && x2 == 0) {
            try {
                String[] parts = value.split("<|>");
                x2 = Integer.parseInt(parts[1].trim());
            } catch (NumberFormatException | NullPointerException | ArrayIndexOutOfBoundsException e) {
                System.out.println(
                        "seq.integer start value is incorrect. make sure it's an integer between <> signs in configuration file.");
                System.exit(0);
            }
        }
        seqIntegerStarted = true;
        if (value.startsWith("seq.integer")) {
            fields.put(key, String.valueOf(x2++));
        }
    }

    /**
     * Stores a random integer from the inclusive range given as
     * {@code range:[low,high]} under {@code key}. {@code range:[min,max]}
     * stands for 1 to {@link Integer#MAX_VALUE}.
     */
    private static void range_integer(Map<String, Object> fields, String key, String value) {
        try {
            String[] bounds = bracket_items(value);
            String first = bounds[0].trim();
            String second = bounds[1].trim();
            long low;
            long high;
            if (first.equalsIgnoreCase("min") && second.equalsIgnoreCase("max")) {
                low = 1;
                high = Integer.MAX_VALUE;
            } else {
                low = Integer.parseInt(first);
                high = Integer.parseInt(second);
            }
            if (low > high) {
                throw new IllegalArgumentException("range minimum is greater than maximum");
            }
            fields.put(key, String.valueOf(random_between(low, high)));
        } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
            // IllegalArgumentException also covers NumberFormatException.
            System.out.println("Make sure you have the right integer range format in configration file");
            System.out.println("example    range:[3,30]" + "\n");
            System.exit(0);
        }
    }

    /**
     * Stores one randomly chosen value of the enumeration {@code [a,b,c]} under
     * {@code key}; every listed value can be chosen.
     */
    private static void single_enum(Map<String, Object> fields, String key, String value) {
        try {
            String[] options = value.substring(1, value.length() - 1).split(",");
            fields.put(key, options[RANDOM.nextInt(options.length)]);
        } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
            System.out.println("Make sure you have the right enumeration format in configuration file");
            System.out.println("example:    [red,green,blue]" + "\n");
            System.exit(0);
        }
    }

    /**
     * Stores a random non-empty subset of the values of {@code multi:[a,b,c]}
     * under {@code key} as a list of strings.
     */
    private static void multi_enum_json(Map<String, Object> fields, String key, String value) {
        try {
            fields.put(key, pick_multi_values(value));
        } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
            System.out.println("Make sure you have the right multi value enumeration format in configration file");
            System.out.println("example:    multi:[science,sport,art,literature,politics]" + "\n");
            System.exit(0);
        }
    }

    /**
     * Stores a random non-empty subset of the values of {@code multi:[a,b,c]}
     * under {@code key} as one string, the values separated by {@code ;}.
     */
    private static void multi_enum_dat(Map<String, Object> fields, String key, String value) {
        try {
            List<String> chosen = pick_multi_values(value);
            StringBuilder joined = new StringBuilder();
            for (int i = 0; i < chosen.size(); i++) {
                if (i > 0) {
                    joined.append(';');
                }
                joined.append(chosen.get(i));
            }
            fields.put(key, joined.toString());
        } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
            System.out.println("Make sure you have the right multi value enumeration format in configration file");
            System.out.println("example:    multi:[science,sport,art,literature,politics]" + "\n");
            System.exit(0);
        }
    }

    /**
     * Picks between one and all of the values listed in a {@code multi:[...]}
     * specification, without repetition and in random order.
     *
     * @throws IndexOutOfBoundsException if the specification has no {@code :[...]} part
     * @throws IllegalArgumentException  if the list contains no value
     */
    private static List<String> pick_multi_values(String value) {
        List<String> pool = new ArrayList<>(Arrays.asList(bracket_items(value)));
        Collections.shuffle(pool, RANDOM);
        int howMany = 1 + RANDOM.nextInt(pool.size());
        return new ArrayList<>(pool.subList(0, howMany));
    }

    /**
     * Returns the comma-separated items of a {@code name:[item,item,...]}
     * specification. Only the first {@code :} separates the name from the list.
     *
     * @throws IndexOutOfBoundsException if there is no {@code :} or the list part
     *                                   is shorter than two characters
     */
    private static String[] bracket_items(String value) {
        String list = value.split(":", 2)[1];
        return list.substring(1, list.length() - 1).split(",");
    }

    /** Returns a random number between {@code low} and {@code high}, both inclusive. */
    private static long random_between(long low, long high) {
        long span = high - low + 1;
        if (span <= Integer.MAX_VALUE) {
            return low + RANDOM.nextInt((int) span);
        }
        return low + (long) (RANDOM.nextDouble() * span);
    }

    /** Stores generated text for JSON output: source lines separated by a space. */
    private static void generate_Text_json(Map<String, Object> fields, String key) throws IOException {
        fields.put(key, build_text(" "));
    }

    /** Stores generated text for DAT output: every source line followed by character 174. */
    private static void generate_Text_dat(Map<String, Object> fields, String key) throws IOException {
        fields.put(key, build_text(String.valueOf(DAT_LINE_BREAK)));
    }

    /**
     * Consumes source lines from {@link #tx} until the text reaches a random
     * target length between {@link #min} and {@link #max}, loading further
     * source files as the queue runs low. The trimmed length of the result is
     * added to {@link #total}.
     *
     * @param separator text appended after every consumed line
     * @return the generated text, trimmed
     */
    private static String build_text(String separator) throws IOException {
        // todo: new line character being rendered as text instead of control
        long target = random_between(min, max);
        StringBuilder text = new StringBuilder();

        while (text.length() < target) {
            // An empty source file leaves the queue empty; try each file once.
            for (int tries = 0; tx.isEmpty() && tries < listOfFiles.length; tries++) {
                switch_file(tx);
            }
            if (tx.isEmpty()) {
                System.out.println("The text source files do not contain any text." + "\n");
                System.exit(0);
            }

            text.append(tx.remove(0)).append(separator);

            if (tx.size() <= 1) {
                switch_file(tx);
            }
        }

        String result = text.toString().trim();
        total += result.length();
        return result;
    }

    /**
     * Stores a random timestamp from the range given as
     * {@code date:[from,to]} under {@code key}, formatted as
     * {@code yyyy-M-d HH:mm:ss}. A bound may be a full date, a year and month,
     * or a year only; missing parts are filled in randomly (month 1-12,
     * day 1-28).
     */
    private static void generate_date(Map<String, Object> fields, String key, String value) {
        try {
            String[] range = bracket_items(value);

            for (int i = 0; i < 2; i++) {
                range[i] = range[i].trim();
                int parts = range[i].split("-").length;
                if (parts == 1) {
                    String month = two_digits(RANDOM.nextInt(12) + 1);
                    String day = two_digits(RANDOM.nextInt(28) + 1);
                    range[i] += "-" + month + "-" + day;
                } else if (parts == 2) {
                    range[i] += "-" + two_digits(RANDOM.nextInt(28) + 1);
                }
            }

            long begin = Timestamp.valueOf(range[0] + " 00:00:00").getTime();
            long end = Timestamp.valueOf(range[1] + " 23:59:59").getTime();
            long diff = end - begin + 1;
            Timestamp picked = new Timestamp(begin + (long) (RANDOM.nextDouble() * diff));
            SimpleDateFormat format = new SimpleDateFormat("yyyy-M-d HH:mm:ss");

            fields.put(key, format.format(picked));
        } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
            // Timestamp.valueOf reports a malformed date with IllegalArgumentException.
            System.out.println("Make sure you have the right date range format in configration file");
            System.out.println("example:    date:[2013-04-07,2014-09]" + "\n");
            System.exit(0);
        }
    }

    /** Formats a non-negative number with at least two digits. */
    private static String two_digits(int number) {
        return number < 10 ? "0" + number : String.valueOf(number);
    }

    /**
     * Appends the lines of the next source file to {@code tx}. Files are used in
     * rotation: after the last one, the first one is read again.
     */
    private static void switch_file(List<String> tx) throws IOException {
        if (listOfFiles == null || listOfFiles.length == 0) {
            return;
        }
        file_index = (file_index + 1) % listOfFiles.length;
        load_lines(listOfFiles[file_index], tx);
    }

    /** Appends every line of {@code source} to {@code target} and closes the file. */
    private static void load_lines(File source, List<String> target) throws IOException {
        LineIterator it = FileUtils.lineIterator(source);
        try {
            while (it.hasNext()) {
                target.add(it.nextLine());
            }
        } finally {
            it.close();
        }
    }

    /**
     * Writes {@code schema.xml} into {@link #output_dir} with the content
     * produced by {@code SchemaGenerator} for {@link #schema_fields}. A write
     * error is reported on standard output.
     */
    public static void generate_xml() {
        SchemaGenerator schema = new SchemaGenerator(schema_fields);
        try {
            // Generate first so that a failure does not leave a truncated file.
            String content = schema.generateSchema();
            File file = new File(output_dir + "/" + "schema.xml");
            try (BufferedWriter bw = new BufferedWriter(new FileWriter(file.getAbsoluteFile()))) {
                bw.write(content);
            }
        } catch (IOException e) {
            System.out.println("There was an error writing to the xml file." + "\n");
        }
    }

    /**
     * Writes a fixed, empty {@code fieldMap.xml} into {@link #output_dir}. A
     * write error is reported on standard output.
     */
    public static void generate_field_map() {
        File file = new File(output_dir + "/" + "fieldMap.xml");
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(file.getAbsoluteFile()))) {
            bw.write("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "\n");
            bw.write("<fieldMap name=\"Default Field Map\" version=\"1.0\">" + "\n");
            bw.write("     <!--" + "\n");
            bw.write("     There are no field maps for the Law Exporter" + "\n");
            bw.write("     -->" + "\n");
            bw.write("</fieldMap>" + "\n");
        } catch (IOException e) {
            System.out.println("There was an error writing to the xml file." + "\n");
        }
    }

    /** Returns {@code val} without its trailing {@code *} marker, if it has one. */
    private static String generate_facet_value(String val) throws IOException {
        return val.endsWith("*") ? val.substring(0, val.length() - 1) : val;
    }

    /**
     * Creates one document: a copy of {@code template} in which every field of
     * a recognised type has been replaced by a generated value. Fields of an
     * unrecognised type keep their type specification as value.
     *
     * @param template  field name to type specification
     * @param datFormat {@code true} to generate values for DAT output,
     *                  {@code false} for JSON output
     */
    private static LinkedHashMap<String, Object> generate_document(Map<String, Object> template,
            boolean datFormat) throws IOException {
        LinkedHashMap<String, Object> doc = new LinkedHashMap<>(template);
        for (Map.Entry<String, Object> field : template.entrySet()) {
            String key = field.getKey();
            String value = generate_facet_value((String) field.getValue());

            if (value.startsWith("integer.key")) {
                integer_key(doc, key, value);
            } else if (value.startsWith("seq.integer")) {
                seq_integer(doc, key, value);
            } else if (value.startsWith("range")) {
                range_integer(doc, key, value);
            } else if (value.startsWith("[")) {
                single_enum(doc, key, value);
            } else if (value.startsWith("multi")) {
                if (datFormat) {
                    multi_enum_dat(doc, key, value);
                } else {
                    multi_enum_json(doc, key, value);
                }
            } else if (value.startsWith("date")) {
                generate_date(doc, key, value);
            } else if (value.equals("text.key") || value.equals("text")) {
                if (datFormat) {
                    generate_Text_dat(doc, key);
                } else {
                    generate_Text_json(doc, key);
                }
            } else if (value.startsWith("(") && value.length() >= 2) {
                // Constant: the text between the parentheses.
                doc.put(key, value.substring(1, value.length() - 1));
            }
        }
        return doc;
    }

    /** Prints a progress report every time {@link #cnt} reaches {@link #count_check}. */
    private static void report_progress() {
        if (cnt == count_check) {
            System.out.println("Number of Documents created: " + count_check);
            System.out.println("Reading from file: (" + listOfFiles[file_index] + ")");
            System.out.println("Size of all documents so far: (" + total + ") Bytes");
            System.out.println("\n");

            count_check += 1000;
        }
    }

    /** Prints the final document count, text size and elapsed time. */
    private static void report_totals(long startTime) {
        System.out.println("Total Number of Documents created: " + cnt);
        System.out.println("Total size of Dataset created: " + total);
        long duration = System.nanoTime() - startTime;
        System.out.println("Total execuion time: " + (double) duration / 1000000000.0 + " Seconds" + "\n");
    }

    /**
     * Writes exactly {@link #documents_count} documents as one UTF-8 JSON
     * object of the form {@code {"docs":[ {...}, {...} ]}}.
     *
     * @param config_dir path of the configuration file holding the field definitions
     * @param startTime  {@link System#nanoTime()} value taken at program start
     */
    private static void Build_json_file(String config_dir, long startTime) throws IOException {
        File f = new File(output_dir + "/" + Default_DataSet_name);
        // The field definitions do not change during a run: read them once.
        LinkedHashMap<String, Object> template = Parse_Document_fields(config_dir);

        ObjectMapper objectMapper = new ObjectMapper();
        objectMapper.configure(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, false);

        try (Writer wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"))) {
            wr.write("{\"docs\":[");
            wr.write("\n");

            for (long i = 0; i < documents_count; i++) {
                fields = generate_document(template, false);

                // Separator goes before every document but the first, so that
                // no trailing comma is left in front of the closing bracket.
                if (i > 0) {
                    wr.write(",\n");
                }
                wr.write(objectMapper.writeValueAsString(fields));
                fields.clear();

                cnt = i + 1;
                report_progress();
            }

            wr.write("\n]}");
        }

        report_totals(startTime);
    }

    /**
     * Writes a header row followed by exactly {@link #documents_count} document
     * rows to an ISO-8859-1 encoded .dat file. Every cell is wrapped in
     * {@code U+00FE}, cells are separated by {@code U+0014} and rows end with
     * CR LF. Each row has one cell per configured field, in header order.
     *
     * @param config_dir path of the configuration file holding the field definitions
     * @param startTime  {@link System#nanoTime()} value taken at program start
     */
    private static void Build_dat_file(String config_dir, long startTime) throws IOException {
        File f = new File(output_dir + "/" + Default_DataSet_name);
        // The field definitions do not change during a run: read them once.
        LinkedHashMap<String, Object> template = Parse_Document_fields(config_dir);

        try (Writer wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "ISO-8859-1"))) {
            wr.write(dat_row(template.keySet()) + "\r\n");

            for (long i = 0; i < documents_count; i++) {
                fields = generate_document(template, true);
                wr.write(dat_row(fields.values()) + "\r\n");
                fields.clear();

                cnt = i + 1;
                report_progress();
            }
        }

        report_totals(startTime);
    }

    /** Formats one .dat row: each cell wrapped in U+00FE, cells separated by U+0014. */
    private static String dat_row(Collection<?> cells) {
        StringBuilder row = new StringBuilder();
        for (Object cell : cells) {
            if (row.length() > 0) {
                row.append(DAT_DELIMITER);
            }
            row.append(DAT_QUOTE).append(cell).append(DAT_QUOTE);
        }
        return row.toString();
    }
}