Java tutorial
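The listing below is the colorcount_HJ class from the edu.iu.subgraph package: a multi-threaded, BSP-LRT style color-coding subgraph counter. Each iteration it samples a random coloring of the local graph, counts subtemplates bottom-up in a dynamic programming table, and exchanges counts between mappers with Harp regroup/allreduce collectives. Worked examples appear as comment blocks next to the methods they illustrate.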
/*
 * Copyright 2013-2017 Indiana University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.iu.subgraph;

import org.apache.log4j.Logger;
import org.apache.commons.lang3.ArrayUtils;

import java.util.Arrays;
import java.util.Random;

import edu.iu.harp.example.IntArrPlus;
import edu.iu.harp.example.DoubleArrPlus;
import edu.iu.harp.partition.Partition;
import edu.iu.harp.partition.Table;
import edu.iu.harp.resource.IntArray;
import edu.iu.harp.resource.DoubleArray;
import edu.iu.harp.resource.LongArray;
import edu.iu.harp.io.ConnPool;
import edu.iu.harp.resource.ResourcePool;

import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntAVLTreeSet;
import it.unimi.dsi.fastutil.objects.ObjectIterator;

import org.apache.hadoop.mapreduce.Mapper.Context;

// using HJlib
import static edu.rice.hj.Module0.launchHabaneroApp;
import static edu.rice.hj.Module1.forallChunked;

// for BSP-LRT
import java.util.*;
import java.util.BitSet;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

// threads affinity
import net.openhft.affinity.Affinity;

// for triggering vtune
import java.nio.file.*;
import java.io.IOException;

// for hadoop assert
import static org.junit.Assert.*;

/**
 * @brief A color-count implementation in a BSP-LRT style
 */
public class colorcount_HJ {

  // ----------------------------- for graph -----------------------------
  // local graph
  Graph g;
  // vertex number of the local graph
  int num_verts_graph;
  // color value of each vertex of the local graph
  int[] colors_g;
  // max vertex abs id of the global graph
  private int max_abs_id;

  // ----------------------------- for templates -----------------------------
  // global template
  Graph t;
  // equals the number of template vertices
  int num_colors;
  // partitions a template into a chain of subtemplates
  partitioner part;
  // partitioned subtemplates
  Graph[] subtemplates;
  // number of subtemplates
  int subtemplate_count;
  // vertex count of each subtemplate
  int[] num_verts_table;
  // vertex count of the current subtemplate
  private int num_verts_sub_ato;
  // number of color combinations for the current subtemplate
  private int num_combinations_ato;

  // ----------------------------- for dynamic programming -----------------------------
  // stores counts in the dynamic programming table
  dynamic_table_array dt;
  // current active and passive template ids, for debug use
  private int active_child;
  private int passive_child;
  // holds all the nonzero positions when updating
  // comm data in the passive child of each subtemplate
  // private int[][] comm_nonzero_index;

  // ----------------------------- for combination colorset index -----------------------------
  // stores the combinations of color sets
  int[][] choose_table;
  // temp index sets used to construct comb_num_indexes
  int[][][][] index_sets;
  // temp color sets used to construct comb_num_indexes
  int[][][][][] color_sets;
  // records comb num values for the active and passive children
  int[][][][] comb_num_indexes;
  // records comb num values for each subtemplate and each color combination
  int[][] comb_num_indexes_set;

  // ----------------------------- for multi-threading counting -----------------------------
  // thread barrier
  private CyclicBarrier barrier;
  // for mutual locking
  private ReentrantLock lock;
  private int thread_num = 24;
  private int core_num = 24;
  // hardware threads per core
  private int tpc = 2;
  private String affinity = "compact";
  // chunks of local graph vertices for each thread
  private int[] chunks;
  private int[] chunks_pipeline;
  // chunks of mappers for each thread
  private int[] chunks_mapper;
  private double[] cc_ato;
  // stores the local counts for the last template
  private double[] count_local_root;
  // stores the communicated counts for the last template
  private double[] count_comm_root;
  // total counts on the local graph
  private double full_count_ato = 0;
  // cumulative total counts from all iterations
  private double cumulate_count_ato = 0;

  // ----------------------------- for communication -----------------------------
  // harp communicator
  private SCCollectiveMapper mapper;
  private int mapper_num = 1;
  private int local_mapper_id;
  // the adj_abs_v requested from each mapper
  private Set<Integer>[] comm_mapper_vertex;
  // private Table<IntArray> recv_vertex_table;
  // harp table that stores vertices sent to other mappers
  private Table<IntArray> send_vertex_table;
  // harp table that regroups send/recv abs_v_id
  private Table<IntArray> comm_vertex_table;
  // harp table that regroups send/recv color counts
  private Table<SCSet> comm_data_table;
  // comm adj_abs_id for each local rel_v_id
  private int[][] update_map;
  // the size of comm adj_abs_id for each local rel_v_id
  private int[] update_map_size;
  // maps an arbitrary abs_v_id to the mapper id it belongs to
  private int[] abs_v_to_mapper;
  // maps an arbitrary abs_v_id to its relative position in the update queues
  private int[] abs_v_to_queue;
  private int[] update_mapper_len;
  // start/end counts pos of each adj_v_id counts on each mapper
  private int[][][] update_queue_pos;
  // adj_v_id counts on each mapper
  private float[][][] update_queue_counts;
  private short[][][] update_queue_index;
  // caches the compressed sending data
  private float[][] compress_cache_array;
  private short[][] compress_cache_index;
  private int[] compress_cache_len;
  // caches the rel pos of adj for each local v in pipeline rotation
  private int[][] map_ids_cache_pip;
  private int[][] chunk_ids_cache_pip;
  private int[][] chunk_internal_offsets_cache_pip;
  // by default the send array size limit is 250MB
  private long send_array_limit = 250L * 1024L * 1024L;
  private boolean rotation_pipeline = true;
  private int pipeline_update_id = 0;
  private int pipeline_send_id = 0;
  private int pipeline_recv_id = 0;

  // ---------------------------- for labels and other functionality ----------------------------
  int[] labels_g;
  int[] labels_t;
  boolean labeled;
  double[] final_vert_counts;
  boolean do_graphlet_freq = false;
  boolean do_vert_output = false;

  // -------------------------------- Misc --------------------------------
  // records computation time
  private long start_comp = 0;
  // avg computation time over all iterations
  private long time_comp = 0;
  // global comm time, excluding the waiting time on local computation
  private long start_comm = 0;
  private long time_comm = 0;
  // local sync time, including comm time and waiting time caused by load imbalance
  private long start_sync = 0;
  private long time_sync = 0;
  private long start_misc = 0;
  private Context context;
  // number of iterations
  int num_iter;
  int cur_iter;
  final Logger LOG = Logger.getLogger(colorcount_HJ.class);
  boolean verbose = false;

  /**
   * @brief initialize the local graph
   *
   * @param mapper
   * @param context
   * @param local_graph
   * @param global_max_v_id
   * @param thread_num
   * @param core_num
   * @param tpc
   * @param affinity
   * @param do_gdd
   * @param do_vert
   * @param verb
   */
  void init(SCCollectiveMapper mapper, Context context, Graph local_graph, int global_max_v_id,
      int thread_num, int core_num, int tpc, String affinity,
      boolean do_gdd, boolean do_vert, boolean verb) {
    // assign params
    this.mapper = mapper;
    this.context = context;
    this.g = local_graph;
    this.max_abs_id = global_max_v_id;
    this.thread_num = thread_num;
    this.core_num = core_num;
    this.tpc = tpc;
    this.affinity = affinity;
    this.do_graphlet_freq = do_gdd;
    this.do_vert_output = do_vert;
    this.verbose = verb;

    // init members
    this.labels_g = this.g.labels;
    this.labeled = this.g.labeled;
    this.num_verts_graph = this.g.num_vertices();
    this.colors_g = new int[this.num_verts_graph];
    this.cc_ato = new double[this.thread_num];
    this.count_local_root = new double[this.thread_num];
    this.count_comm_root = new double[this.thread_num];
    this.dt = new dynamic_table_array();
    this.barrier = new CyclicBarrier(this.thread_num);

    if (do_graphlet_freq || do_vert_output) {
      // TODO: graphlet frequency and per-vertex output
    }
  }
  /**
   * @brief init members related to communication
   *
   * @param mapper_id_vertex
   * @param send_array_limit
   * @param rotation_pipeline
   */
  void init_comm(int[] mapper_id_vertex, long send_array_limit, boolean rotation_pipeline) {
    // create the abs mapping structure
    this.abs_v_to_mapper = mapper_id_vertex;
    this.send_array_limit = send_array_limit;
    this.rotation_pipeline = rotation_pipeline;

    // init members
    this.mapper_num = this.mapper.getNumWorkers();
    this.local_mapper_id = this.mapper.getSelfID();
    this.abs_v_to_queue = new int[this.max_abs_id + 1];
    // this.chunks_mapper = divide_chunks(this.mapper_num, this.thread_num);

    // init comm mappers
    this.comm_mapper_vertex = new HashSet[this.mapper_num];
    for (int i = 0; i < this.mapper_num; i++)
      this.comm_mapper_vertex[i] = new HashSet<>();

    // loop over all the adjacencies of the local graph's vertices;
    // adj_array stores abs ids
    int[] adj_array = this.g.adjacencies();
    for (int i = 0; i < adj_array.length; i++) {
      int adj_abs_id = adj_array[i];
      int mapper_id = this.abs_v_to_mapper[adj_abs_id];
      this.comm_mapper_vertex[mapper_id].add(adj_abs_id);
    }

    this.update_map = new int[this.num_verts_graph][];
    for (int i = 0; i < this.num_verts_graph; i++)
      this.update_map[i] = new int[this.g.out_degree(i)];

    this.update_map_size = new int[this.num_verts_graph];
    this.update_queue_pos = new int[this.mapper_num][][];
    this.update_queue_counts = new float[this.mapper_num][][];
    this.update_queue_index = new short[this.mapper_num][][];
    this.update_mapper_len = new int[this.mapper_num];

    this.comm_vertex_table = new Table<>(0, new IntArrPlus());
    this.send_vertex_table = new Table<>(0, new IntArrPlus());

    this.compress_cache_array = new float[this.num_verts_graph][];
    this.compress_cache_index = new short[this.num_verts_graph][];
    this.compress_cache_len = new int[this.num_verts_graph];

    this.map_ids_cache_pip = new int[this.num_verts_graph][];
    this.chunk_ids_cache_pip = new int[this.num_verts_graph][];
    this.chunk_internal_offsets_cache_pip = new int[this.num_verts_graph][];

    // prepare send/recv information;
    // the requested adj_abs_v queues will be sent to each mapper
    for (int i = 0; i < this.mapper_num; i++) {
      // i is the mapper id from which the local mapper demands adj data
      if ((i != this.local_mapper_id) && this.comm_mapper_vertex[i].size() > 0) {
        // create the partition id: the upper 16 bits store the requested id,
        // the lower 16 bits store the sender id (local id)
        int comm_id = ((i << 16) | this.local_mapper_id);

        // retrieve the communicated adj_abs_id (convert the set to a primitive array)
        ArrayList<Integer> temp_array = new ArrayList<>(comm_mapper_vertex[i]);
        int[] temp_array_primitive = ArrayUtils.toPrimitive(temp_array.toArray(new Integer[temp_array.size()]));

        // record the length of the total vertices array requested from mapper i
        this.update_mapper_len[i] = temp_array_primitive.length;

        // create the mapping from adj_abs_id to its relative position in the update offset queue
        for (int j = 0; j < temp_array_primitive.length; j++)
          this.abs_v_to_queue[temp_array_primitive[j]] = j;

        IntArray comm_array = new IntArray(temp_array_primitive, 0, temp_array_primitive.length);
        // table used to communicate with other mappers
        this.comm_vertex_table.addPartition(new Partition<>(comm_id, comm_array));
      }
    }

    if (this.verbose)
      LOG.info("Start regroup comm_vertex_table");

    this.mapper.regroup("sc", "regroup-send-recv-vertex", this.comm_vertex_table, new SCPartitioner(this.mapper_num));
    // the name of the barrier should not be the same as that of the regroup,
    // otherwise it may cause a deadlock
    this.mapper.barrier("sc", "regroup-send-recv-sync");

    if (this.verbose)
      LOG.info("Finish regroup comm_vertex_table");

    // pack the received requested adj_v_id information into send queues
    for (int comm_id : this.comm_vertex_table.getPartitionIDs()) {
      // the mapper that requested this partition's vertices; replies go back to it
      int dst_mapper_id = comm_id & ((1 << 16) - 1);
      // this should equal local_mapper_id
      int src_mapper_id = comm_id >>> 16;

      // create the send table
      send_vertex_table.addPartition(new Partition(dst_mapper_id, comm_vertex_table.getPartition(comm_id).get()));

      if (this.verbose) {
        // check the src id with an assertion
        assertEquals("comm_vertex_table sender not matched ", this.local_mapper_id, src_mapper_id);
        LOG.info("Send from mapper: " + src_mapper_id + " to mapper: " + dst_mapper_id
            + "; partition size: " + comm_vertex_table.getPartition(comm_id).get().get().length);
      }
    }

    // release memory
    for (int i = 0; i < this.mapper_num; i++)
      this.comm_mapper_vertex[i] = null;
    this.comm_mapper_vertex = null;
  }
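  /*
   * A worked example of the 16/16-bit comm_id layout used in init_comm,
   * assuming mapper ids fit in 16 bits: for requested mapper i = 3 and
   * local_mapper_id = 1,
   *
   *   comm_id = (3 << 16) | 1 = 0x00030001
   *
   * and after the regroup routes the partition by its upper bits, the
   * receiver (mapper 3) recovers both sides with
   *
   *   comm_id >>> 16            == 3  // the mapper that owns the adj data
   *   comm_id & ((1 << 16) - 1) == 1  // the requester, i.e. the reply target
   */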
  /**
   * @brief compute color counting over N iterations
   *
   * @param template
   * @param N
   *
   * @return the averaged template count
   */
  public double do_full_count(Graph template, int N) {
    this.t = template;
    this.num_iter = N;
    this.labels_t = t.labels;

    // ------------------- create subtemplates and the comb number index system -------------------
    if (this.verbose) {
      LOG.info("Beginning partition...");
    }
    // partition the template into subtemplates
    this.part = new partitioner(this.t, this.labeled, this.labels_t);
    this.part.sort_subtemplates();
    if (this.verbose) {
      LOG.info("done partitioning");
    }

    // the number of colors equals the number of template vertices
    this.num_colors = this.t.num_vertices();
    // get the array of subtemplates
    this.subtemplates = this.part.get_subtemplates();
    // number of subtemplates
    this.subtemplate_count = this.part.get_subtemplate_count();

    // build the index tables for each subtemplate and each combination of color sets
    create_tables();
    // initialize the dynamic programming table, a subtemplate-vertex-color array
    this.dt.init(this.subtemplates, this.subtemplate_count, this.num_verts_graph, this.num_colors, this.max_abs_id);

    // divide the (potentially huge) local vertex range into per-thread chunks
    this.chunks = divide_chunks(this.num_verts_graph, this.thread_num);
    // in the pipelined regroup-update, thread 0 does communication
    this.chunks_pipeline = divide_chunks(this.num_verts_graph, this.thread_num - 1);

    // trigger vtune profiling (add a command option for vtune)
    // java.nio.file.Path vtune_file = java.nio.file.Paths.get("vtune-flag.txt");
    // String flag_trigger = "Start training process and trigger vtune profiling.";
    // try {
    //   java.nio.file.Files.write(vtune_file, flag_trigger.getBytes());
    // } catch (IOException e) {
    //   LOG.info("Failed to create vtune trigger flag");
    // }

    if (this.verbose) {
      LOG.info("Starting Multi-threading Counting Iterations");
    }

    launchHabaneroApp(() -> forallChunked(0, this.thread_num - 1, (threadIdx) -> {

      // set Java thread affinity
      BitSet bitSet = new BitSet(this.core_num);
      int thread_mask = 0;

      if (this.verbose && threadIdx == 0) {
        LOG.info("Set up threads affinity: Core Num: " + this.core_num
            + "; Total Threads: " + this.thread_num + "; thd per core: " + this.tpc
            + "; affinity: " + this.affinity);
      }

      if ("scatter".equals(this.affinity)) {
        // bind threads to cores round-robin
        thread_mask = threadIdx % this.core_num;
      } else {
        // default affinity "compact": a compact bind, tpc threads per core
        int tpn = this.tpc * this.core_num;
        thread_mask = threadIdx % tpn;
        thread_mask /= this.tpc;
      }

      bitSet.set(thread_mask);
      Affinity.setAffinity(bitSet);

      try {
        // the main loop over iterations
        for (int cur_itr = 0; cur_itr < this.num_iter; cur_itr++) {
          if (threadIdx == 0)
            this.cur_iter = cur_itr;

          // start sampling colors
          this.barrier.await();
          if (verbose && threadIdx == 0) {
            LOG.info("Start Sampling Graph for Itr: " + cur_itr);
            // this.start_comp = System.currentTimeMillis();
            this.start_misc = System.currentTimeMillis();
          }

          Random rand = new Random(System.currentTimeMillis());
          // sample the vertices of the full graph g
          for (int i = this.chunks[threadIdx]; i < this.chunks[threadIdx + 1]; ++i) {
            this.colors_g[i] = rand.nextInt(this.num_colors);
          }

          this.barrier.await();
          if (this.verbose && threadIdx == 0) {
            LOG.info("Finish Sampling Graph for Itr: " + cur_itr + "; use time: "
                + (System.currentTimeMillis() - this.start_misc) + "ms");
          }

          // start counting, from the bottom of the subtemplate chain
          for (int s = this.subtemplate_count - 1; s > 0; --s) {
            if (threadIdx == 0) {
              // get the vertex count of subtemplate s
              this.num_verts_sub_ato = this.num_verts_table[s];
              if (this.verbose)
                LOG.info("Initing Subtemplate " + s + ", t verts: " + num_verts_sub_ato);

              int a = this.part.get_active_index(s);
              int p = this.part.get_passive_index(s);
              if (this.verbose)
                LOG.info("Subtemplate: " + s + "; active_idx: " + a + "; passive_idx: " + p);

              this.dt.init_sub(s, a, p);
            }

            this.barrier.await();
            if (this.verbose && threadIdx == 0)
              LOG.info("Start Counting Local Graph Subtemplate " + s);

            // hit the bottom of the subtemplate chain: a dangling template node
            if (this.num_verts_sub_ato == 1) {
              if (s == this.subtemplate_count - 1) {
                init_table_node_HJ(s, threadIdx, this.chunks);
              } else {
                if (threadIdx == 0)
                  dt.set_to_table(this.subtemplate_count - 1, s);
              }
            } else {
              colorful_count_HJ(s, threadIdx, this.chunks);
            }

            this.barrier.await();
            if (this.verbose && threadIdx == 0)
              LOG.info("Finish Counting Local Graph Subtemplate " + s);

            // the communication part (single thread);
            // only for subtemplates of size > 1, which may have neighbours on other mappers,
            // and only if there is more than one mapper, otherwise all g verts are local
            if (this.mapper_num > 1 && this.num_verts_sub_ato > 1) {
              if (this.rotation_pipeline)
                regroup_update_pipeline(s, threadIdx);
              else
                regroup_update_all(s, threadIdx, this.chunks);
            }

            // print out results for subtemplate s
            this.barrier.await();
            if (threadIdx == 0) {
              if (this.verbose)
                LOG.info("JVM Memory Used in subtemplate: " + s + " is: "
                    + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()));

              int a = this.part.get_active_index(s);
              int p = this.part.get_passive_index(s);

              // release templates of size > 1
              if (a != SCConstants.NULL_VAL) {
                // do not free the dangling template node
                if (this.num_verts_table[a] > 1)
                  this.dt.clear_sub(a);
              }
              if (p != SCConstants.NULL_VAL) {
                if (this.num_verts_table[p] > 1)
                  this.dt.clear_sub(p);
              }
            }

            this.barrier.await();

            // print out counts for this subtemplate
            if (this.verbose) {
              print_counts(s, threadIdx, this.chunks);
              this.barrier.await();
              if (threadIdx == 0) {
                double sub_total_counts = 0;
                for (int x = 0; x < this.thread_num; x++)
                  sub_total_counts += cc_ato[x];
                LOG.info("Total counts for sub-temp: " + s + " is: " + sub_total_counts);
              }
              this.barrier.await();
            }

            if (threadIdx == 0)
              this.context.progress();

          } // end of a subtemplate

          if (verbose && threadIdx == 0)
            LOG.info("Done with initialization. Doing full count");

          // do the count for the full template
          if (threadIdx == 0) {
            this.num_verts_sub_ato = this.num_verts_table[0];
            int a = this.part.get_active_index(0);
            int p = this.part.get_passive_index(0);
            if (this.verbose)
              LOG.info("Subtemplate 0; active_idx: " + a + "; passive_idx: " + p);
            dt.init_sub(0, a, p);
          }

          this.barrier.await();
          colorful_count_HJ(0, threadIdx, chunks);
          this.barrier.await();

          // communicate and add the communicated counts to full_count_ato;
          // only for subtemplates of size > 1, which may have neighbours on other mappers,
          // and only if there is more than one mapper, otherwise all g verts are local
          if (this.num_verts_sub_ato > 1 && this.mapper_num > 1) {
            if (this.rotation_pipeline)
              regroup_update_pipeline(0, threadIdx);
            else
              regroup_update_all(0, threadIdx, this.chunks);
          }

          this.barrier.await();

          // print out results for the last subtemplate
          if (threadIdx == 0) {
            double sum_count = 0.0;
            for (int k = 0; k < this.thread_num; k++) {
              sum_count += this.count_local_root[k];
              sum_count += this.count_comm_root[k];
            }
            this.full_count_ato = sum_count;
            LOG.info("Finish update comm counts for last subtemplate; total counts: " + sum_count);
          }

          this.barrier.await();

          if (threadIdx == 0) {
            if (this.verbose)
              LOG.info("JVM Memory Used in Last subtemplate is: "
                  + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()));

            int a = this.part.get_active_index(0);
            int p = this.part.get_passive_index(0);
            if (a != SCConstants.NULL_VAL)
              this.dt.clear_sub(a);
            if (p != SCConstants.NULL_VAL)
              this.dt.clear_sub(p);
            // free the first dangling template node
            this.dt.clear_sub(subtemplate_count - 1);
          }

          // add counts from every iteration
          if (threadIdx == 0) {
            this.cumulate_count_ato += this.full_count_ato;
          }

          // free comm data
          if (threadIdx == 0) {
            if (this.mapper_num > 1) {
              ResourcePool.get().clean();
              ConnPool.get().clean();
            }
            System.gc();
            this.context.progress();
          }

        } // end of an iteration

        this.barrier.await();

      } catch (InterruptedException | BrokenBarrierException e) {
        LOG.info("Caught barrier exception in itr: " + this.cur_iter);
        e.printStackTrace();
      }

    }));
    // ----------------------- end of color counting -----------------------

    double final_count = cumulate_count_ato / (double) this.num_iter;

    // free memory
    this.send_vertex_table = null;
    this.comm_vertex_table = null;
    this.update_map = null;
    this.colors_g = null;
    this.compress_cache_array = null;
    this.compress_cache_index = null;
    this.map_ids_cache_pip = null;
    this.chunk_ids_cache_pip = null;
    this.chunk_internal_offsets_cache_pip = null;

    delete_tables();
    this.part.clear_temparrays();

    return final_count;
  }
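  /*
   * A typical calling sequence for init, init_comm, and do_full_count above
   * (a minimal sketch; the driver variable names are hypothetical):
   *
   *   colorcount_HJ cc = new colorcount_HJ();
   *   cc.init(mapper, context, local_graph, global_max_v_id,
   *           24, 24, 2, "compact", false, false, true);
   *   cc.init_comm(mapper_id_vertex, 250L * 1024L * 1024L, true);
   *   double avg = cc.do_full_count(template, 10);
   *
   * do_full_count runs 10 independent random colorings here and returns
   * cumulate_count_ato / 10, i.e. the mean template count per iteration.
   */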
  /**
   * @brief divide the local graph vertices into multi-threading chunks
   *
   * @param total
   * @param partition
   *
   * @return the chunk boundaries (length partition + 1)
   */
  int[] divide_chunks(int total, int partition) {
    int[] chunks = new int[partition + 1];
    chunks[0] = 0;
    int remainder = total % partition;
    int basic_size = total / partition;
    for (int i = 1; i <= partition; ++i) {
      chunks[i] = chunks[i - 1] + basic_size + (remainder > 0 ? 1 : 0);
      --remainder;
    }
    return chunks;
  }

  /**
   * @brief divide chunks when preparing sending/receiving packets
   *
   * @param total
   * @param partition
   *
   * @return the chunk boundaries (length partition + 1)
   */
  int[] divide_chunks_comm(int total, int partition) {
    int[] chunks = new int[partition + 1];
    chunks[0] = 0;
    int remainder = total % partition;
    int basic_size = total / partition;
    for (int i = 1; i <= partition - 1; ++i) {
      chunks[i] = chunks[i - 1] + basic_size;
    }
    // the last chunk absorbs the remainder
    chunks[partition] = chunks[partition - 1] + basic_size + remainder;
    return chunks;
  }
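  /*
   * Worked example of the two chunking schemes for total = 10, partition = 3:
   *
   *   divide_chunks(10, 3)      -> {0, 4, 7, 10}  (remainder spread over the first chunks)
   *   divide_chunks_comm(10, 3) -> {0, 3, 6, 10}  (remainder folded into the last chunk)
   *
   * Thread (or packet) i then owns the half-open range [chunks[i], chunks[i + 1]).
   */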
  private void init_table_node_HJ(int s, int threadIdx, int[] chunks)
      throws BrokenBarrierException, InterruptedException {
    // the unlabeled case is multi-threaded; the labeled case runs on thread 0 only
    if (!labeled) {
      // initialize the bottom of the chain, using relative v_ids
      for (int v = chunks[threadIdx]; v < chunks[threadIdx + 1]; ++v) {
        // get the randomly assigned color value
        int n = colors_g[v];
        // put counts into the current table for rel v id,
        // template s and color n
        dt.set(v, comb_num_indexes_set[s][n], 1.0f);
      }
      barrier.await();
    } else {
      if (threadIdx == 0) {
        int set_count_loop = 0;
        int[] labels_sub = part.get_labels(s);
        int label_s = labels_sub[0];
        for (int v = 0; v < num_verts_graph; ++v) {
          int n = colors_g[v];
          int label_g = labels_g[v];
          if (label_g == label_s) {
            dt.set(v, comb_num_indexes_set[s][n], 1.0f);
            set_count_loop++;
          }
        }
      }
    }
  }

  /**
   * @brief per-subtemplate counting on the local graph
   *
   * @param s
   * @param threadIdx
   * @param chunks
   */
  private void colorful_count_HJ(int s, int threadIdx, int[] chunks)
      throws BrokenBarrierException, InterruptedException {
    if (threadIdx == 0) {
      // get the vertex count of the subtemplate
      num_verts_sub_ato = subtemplates[s].num_vertices();
      // get the vertex count of the active child
      int active_index = part.get_active_index(s);
      int num_verts_a = num_verts_table[active_index];
      // colorset combinations from the active child
      num_combinations_ato = this.choose_table[num_verts_sub_ato][num_verts_a];
    }

    // clear counts
    count_local_root[threadIdx] = 0.0d;
    barrier.await();

    int[] valid_nbrs = new int[g.max_degree()];
    assert (valid_nbrs != null);
    int valid_nbrs_count = 0;

    // each thread handles a chunk, looping by relative v_id
    for (int v = chunks[threadIdx]; v < chunks[threadIdx + 1]; ++v) {
      // v is a relative v_id from 0 to num_verts - 1
      valid_nbrs_count = 0;

      // check that vertex v is initialized in the active child table;
      // all local verts were already initialized at the bottom of the subtemplate chain
      if (dt.is_vertex_init_active(v)) {
        // adjacencies are absolute v_ids
        int[] adjs_abs = g.adjacent_vertices(v);
        int end = g.out_degree(v);
        // counts of v in the active child, indexed by comb number
        float[] counts_a = dt.get_active(v);

        // loop over all its neighbours
        int nbr_comm_itr = 0;
        for (int i = 0; i < end; ++i) {
          int adj_i = g.get_relative_v_id(adjs_abs[i]);
          // determine whether adj_i is in the current passive table
          if (adj_i >= 0 && dt.is_vertex_init_passive(adj_i)) {
            valid_nbrs[valid_nbrs_count++] = adj_i;
          }
          // a negative adj_i means the neighbour lives on another mapper
          if (this.mapper_num > 1 && adj_i < 0)
            this.update_map[v][nbr_comm_itr++] = adjs_abs[i];
        }

        if (this.mapper_num > 1)
          this.update_map_size[v] = nbr_comm_itr;

        if (valid_nbrs_count != 0) {
          // add counts over local neighbours;
          // num_combinations_verts_sub is the number of color combinations for template s,
          // different from num_combinations_ato, which is for the active child
          int num_combinations_verts_sub = this.choose_table[num_colors][num_verts_sub_ato];

          // for a specific vertex v initialized in the active child:
          // first loop over the color combinations of the current subtemplate
          for (int n = 0; n < num_combinations_verts_sub; ++n) {
            double color_count = 0.0;
            int[] comb_indexes_a = comb_num_indexes[0][s][n];
            int[] comb_indexes_p = comb_num_indexes[1][s][n];
            int p = num_combinations_ato - 1;

            // second loop over the color combinations of the active/passive children;
            // a + p == num_combinations_ato
            // (the total color combinations of both the active and the passive child)
            for (int a = 0; a < num_combinations_ato; ++a, --p) {
              float count_a = counts_a[comb_indexes_a[a]];
              if (count_a > 0) {
                // third loop over the valid neighbours,
                // already checked to be in the passive child
                for (int i = 0; i < valid_nbrs_count; ++i) {
                  color_count += (double) count_a * dt.get_passive(valid_nbrs[i], comb_indexes_p[p]);
                }
              }
            }

            if (color_count > 0.0) {
              // if (do_graphlet_freq || do_vert_output)
              //   final_vert_counts[v] += (double) color_count;
              if (s != 0)
                dt.set(v, comb_num_indexes_set[s][n], (float) color_count);
              else {
                // the last template stores counts into count_local_root
                count_local_root[threadIdx] += color_count;
              }
            }
          }
        }
      }
    }

    valid_nbrs = null;
    barrier.await();
  }

  /**
   * @brief build the index tables for each subtemplate and each combination of color sets
   */
  private void create_tables() {
    // C(n, k): compute the number of unique color set choices; two-dimensional
    this.choose_table = Util.init_choose_table(this.num_colors);
    // record the vertex count of each subtemplate
    create_num_verts_table();
    // create the color indexes for each combination of each set size
    create_all_index_sets();
    // create the color sets for all subtemplates
    create_all_color_sets();
    // convert colorset combinations into numeric values (hash function)
    create_comb_num_system_indexes();
    // free up memory
    delete_all_color_sets();
    delete_all_index_sets();
  }

  private void delete_tables() {
    for (int i = 0; i <= num_colors; ++i)
      this.choose_table[i] = null;
    this.choose_table = null;

    delete_comb_num_system_indexes();
    num_verts_table = null;
  }

  /**
   * @brief record the vertex count of each subtemplate
   */
  private void create_num_verts_table() {
    this.num_verts_table = new int[this.subtemplate_count];
    for (int s = 0; s < this.subtemplate_count; ++s) {
      this.num_verts_table[s] = this.subtemplates[s].num_vertices();
    }
  }

  /**
   * @brief create the color indexes for each combination of each set size
   */
  private void create_all_index_sets() {
    // first dim: (up to) how many colors
    this.index_sets = new int[this.num_colors][][][];

    for (int i = 0; i < (this.num_colors - 1); ++i) {
      int num_vals = i + 2;
      this.index_sets[i] = new int[num_vals - 1][][];

      // second dim: for up to num_vals colors, the different set sizes
      for (int j = 0; j < (num_vals - 1); ++j) {
        int set_size = j + 1;
        int num_combinations = Util.choose(num_vals, set_size);
        // third dim: for a set size, how many combinations from a given number of colors
        this.index_sets[i][j] = new int[num_combinations][];

        // set values start from 1 up to set_size; init the set in increasing order
        int[] set = Util.init_permutation(set_size);

        for (int k = 0; k < num_combinations; ++k) {
          // fourth dim: a combination holds set_size color values
          this.index_sets[i][j][k] = new int[set_size];
          for (int p = 0; p < set_size; ++p) {
            this.index_sets[i][j][k][p] = set[p] - 1;
          }
          // advance to the next color set
          Util.next_set(set, set_size, num_vals);
        }
        set = null;
      }
    }
  }
  /**
   * @brief create the color sets for all subtemplates
   */
  private void create_all_color_sets() {
    // first dim: the number of subtemplates
    this.color_sets = new int[this.subtemplate_count][][][][];

    for (int s = 0; s < this.subtemplate_count; ++s) {
      int num_verts_sub = this.subtemplates[s].num_vertices();

      if (num_verts_sub > 1) {
        // how many sets per subtemplate:
        // choose num_verts_sub vertices of the subtemplate from the colors
        int num_sets = Util.choose(this.num_colors, num_verts_sub);
        // second dim: the number of sets in a subtemplate
        this.color_sets[s] = new int[num_sets][][][];

        // init the colorset permutation
        int[] colorset = Util.init_permutation(num_verts_sub);

        for (int n = 0; n < num_sets; ++n) {
          int num_child_combs = num_verts_sub - 1;
          // third dim: for a subtemplate and a set, the number of child combinations
          this.color_sets[s][n] = new int[num_child_combs][][];

          for (int c = 0; c < num_child_combs; ++c) {
            int num_verts_1 = c + 1;
            int num_verts_2 = num_verts_sub - num_verts_1;

            int[][] index_set_1 = this.index_sets[num_verts_sub - 2][num_verts_1 - 1];
            int[][] index_set_2 = this.index_sets[num_verts_sub - 2][num_verts_2 - 1];

            int num_child_sets = Util.choose(num_verts_sub, c + 1);
            this.color_sets[s][n][c] = new int[num_child_sets][];

            for (int i = 0; i < num_child_sets; ++i) {
              this.color_sets[s][n][c][i] = new int[num_verts_sub];

              for (int j = 0; j < num_verts_1; ++j)
                this.color_sets[s][n][c][i][j] = colorset[index_set_1[i][j]];
              for (int j = 0; j < num_verts_2; ++j)
                this.color_sets[s][n][c][i][j + num_verts_1] = colorset[index_set_2[i][j]];
            }
          }
          Util.next_set(colorset, num_verts_sub, this.num_colors);
        }
        colorset = null;
      }
    }
  }
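  /*
   * How the set-enumeration helpers above interact (behavior inferred from
   * the call sites in this file, not from Util's documentation):
   *
   *   int[] set = Util.init_permutation(2);  // -> {1, 2}, the first 2-subset
   *   Util.next_set(set, 2, 4);              // -> {1, 3}
   *   Util.next_set(set, 2, 4);              // -> {1, 4}, then {2, 3}, {2, 4}, {3, 4}
   *
   * so Util.choose(4, 2) == 6 iterations enumerate every 2-color subset of
   * 4 colors in lexicographic order, which is exactly the iteration pattern
   * of create_all_index_sets and create_all_color_sets.
   */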
  /**
   * @brief convert colorset combinations into numeric values (hash function)
   */
  private void create_comb_num_system_indexes() {
    this.comb_num_indexes = new int[2][this.subtemplate_count][][];
    this.comb_num_indexes_set = new int[this.subtemplate_count][];

    // for each subtemplate
    for (int s = 0; s < this.subtemplate_count; ++s) {
      int num_verts_sub = this.subtemplates[s].num_vertices();
      int num_combinations_s = Util.choose(this.num_colors, num_verts_sub);

      if (num_verts_sub > 1) {
        // for the active and passive children
        this.comb_num_indexes[0][s] = new int[num_combinations_s][];
        this.comb_num_indexes[1][s] = new int[num_combinations_s][];
      }

      this.comb_num_indexes_set[s] = new int[num_combinations_s];

      int[] colorset_set = Util.init_permutation(num_verts_sub);

      // loop over each combination instance
      for (int n = 0; n < num_combinations_s; ++n) {
        // get the hash value for a colorset instance;
        // Util.get_color_index implements the hash function
        this.comb_num_indexes_set[s][n] = Util.get_color_index(colorset_set, num_verts_sub);

        if (num_verts_sub > 1) {
          int num_verts_a = this.part.get_num_verts_active(s);
          int num_verts_p = this.part.get_num_verts_passive(s);

          int[] colors_a;
          int[] colors_p;

          int[][] colorsets = this.color_sets[s][n][num_verts_a - 1];

          int num_combinations_a = Util.choose(num_verts_sub, num_verts_a);
          comb_num_indexes[0][s][n] = new int[num_combinations_a];
          comb_num_indexes[1][s][n] = new int[num_combinations_a];

          int p = num_combinations_a - 1;
          for (int a = 0; a < num_combinations_a; ++a, --p) {
            colors_a = colorsets[a];
            // colors_p is colorsets[p] offset by num_verts_a
            colors_p = new int[num_verts_p];
            System.arraycopy(colorsets[p], num_verts_a, colors_p, 0, num_verts_p);

            int color_index_a = Util.get_color_index(colors_a, num_verts_a);
            int color_index_p = Util.get_color_index(colors_p, num_verts_p);

            this.comb_num_indexes[0][s][n][a] = color_index_a;
            this.comb_num_indexes[1][s][n][p] = color_index_p;
          }
        }

        // advance to the next colorset
        Util.next_set(colorset_set, num_verts_sub, num_colors);
      }
      colorset_set = null;
    }
  }

  private void delete_comb_num_system_indexes() {
    for (int s = 0; s < subtemplate_count; ++s) {
      int num_verts_sub = subtemplates[s].num_vertices();
      int num_combinations_s = Util.choose(num_colors, num_verts_sub);

      for (int n = 0; n < num_combinations_s; ++n) {
        if (num_verts_sub > 1) {
          comb_num_indexes[0][s][n] = null;
          comb_num_indexes[1][s][n] = null;
        }
      }

      if (num_verts_sub > 1) {
        comb_num_indexes[0][s] = null;
        comb_num_indexes[1][s] = null;
      }
      comb_num_indexes_set[s] = null;
    }

    comb_num_indexes[0] = null;
    comb_num_indexes[1] = null;
    comb_num_indexes = null;
    comb_num_indexes_set = null;
  }

  /**
   * @brief free up memory
   */
  private void delete_all_color_sets() {
    for (int s = 0; s < this.subtemplate_count; ++s) {
      int num_verts_sub = this.subtemplates[s].num_vertices();
      if (num_verts_sub > 1) {
        int num_sets = Util.choose(this.num_colors, num_verts_sub);

        for (int n = 0; n < num_sets; ++n) {
          int num_child_combs = num_verts_sub - 1;
          for (int c = 0; c < num_child_combs; ++c) {
            int num_child_sets = Util.choose(num_verts_sub, c + 1);
            for (int i = 0; i < num_child_sets; ++i) {
              this.color_sets[s][n][c][i] = null;
            }
            this.color_sets[s][n][c] = null;
          }
          this.color_sets[s][n] = null;
        }
        this.color_sets[s] = null;
      }
    }
    this.color_sets = null;
  }

  /**
   * @brief free up memory
   */
  private void delete_all_index_sets() {
    for (int i = 0; i < (this.num_colors - 1); ++i) {
      int num_vals = i + 2;
      for (int j = 0; j < (num_vals - 1); ++j) {
        int set_size = j + 1;
        int num_combinations = Util.choose(num_vals, set_size);
        for (int k = 0; k < num_combinations; ++k) {
          this.index_sets[i][j][k] = null;
        }
        this.index_sets[i][j] = null;
      }
      this.index_sets[i] = null;
    }
    this.index_sets = null;
  }

  /**
   * @brief divide the comm_vert_list on each node into small parcels of vertices;
   * the partition count within comm_data_table is far larger than the number of mappers:
   * int[] comm_vert_list is broken into segments, each segment being a partition,
   * and the chunk size is the total number of partitions divided by the number of threads
   *
   * @param sub_id
   */
  private void regroup_comm_all(int sub_id) {
    if (this.verbose)
      LOG.info("Start prepare comm for subtemplate: " + sub_id);

    this.start_sync = System.currentTimeMillis();
    this.start_misc = System.currentTimeMillis();

    // prepare the sending partitions
    this.comm_data_table = new Table<>(0, new SCSetCombiner());
    int comb_len = this.dt.get_num_color_set(this.part.get_passive_index(sub_id));

    // prevent a single comm_data array from exceeding the heap allocation limit (8GB)
    for (int send_id : this.send_vertex_table.getPartitionIDs()) {
      int[] comm_vert_list = this.send_vertex_table.getPartition(send_id).get().get();
      long send_divid_num = (comm_vert_list.length * ((long) comb_len) + this.send_array_limit - 1) / this.send_array_limit;

      if (this.verbose) {
        LOG.info("Send id: " + send_id + "; send_divid_num: " + send_divid_num
            + "; comb len: " + comb_len + "; Total vertices num: " + comm_vert_list.length
            + "; mul: " + comm_vert_list.length * (long) comb_len);
      }

      // send_chunks.length == send_divid_num + 1
      int[] send_chunks = divide_chunks_comm(comm_vert_list.length, (int) send_divid_num);
      // iterate over the send_chunks
      for (int j = 0; j < send_divid_num; j++) {
        int chunk_len = send_chunks[j + 1] - send_chunks[j];
        if (this.verbose)
          LOG.info("Chunk id: " + j + "; chunk size: " + chunk_len);

        int[] send_chunk_list = new int[chunk_len];
        System.arraycopy(comm_vert_list, send_chunks[j], send_chunk_list, 0, chunk_len);

        // comm_id (32 bits) consists of three parts:
        // 1) send_id (12 bits); 2) local mapper_id (12 bits); 3) array parcel id (8 bits)
        int comm_id = ((send_id << 20) | (this.local_mapper_id << 8) | j);
        SCSet comm_data = compress_send_data(send_chunk_list, comb_len);
        this.comm_data_table.addPartition(new Partition<>(comm_id, comm_data));
      }
    }

    if (this.verbose) {
      LOG.info("Finish prepare comm for subtemplate: " + sub_id + "; use time: "
          + (System.currentTimeMillis() - this.start_misc) + "ms");
    }

    // start the regroup communication
    if (this.verbose)
      LOG.info("Start regroup comm for subtemplate: " + sub_id);

    this.start_misc = System.currentTimeMillis();
    this.mapper.regroup("sc", "regroup counts data", this.comm_data_table, new SCPartitioner2(this.mapper_num));
    this.mapper.barrier("sc", "all regroup sync");

    if (this.verbose) {
      LOG.info("Finish regroup comm for subtemplate: " + sub_id + "; use time: "
          + (System.currentTimeMillis() - this.start_misc) + "ms");
    }

    // update local g counts with the adjacency data from each other mapper
    for (int comm_id : this.comm_data_table.getPartitionIDs()) {
      int update_id_tmp = (comm_id & ((1 << 20) - 1));
      int update_id = (update_id_tmp >>> 8);
      int chunk_id = (update_id_tmp & ((1 << 8) - 1));

      // create the update_queue
      if (this.update_queue_pos[update_id] == null) {
        long recv_divid_num = (this.update_mapper_len[update_id] * ((long) comb_len) + this.send_array_limit - 1) / this.send_array_limit;
        this.update_queue_pos[update_id] = new int[(int) recv_divid_num][];
        this.update_queue_counts[update_id] = new float[(int) recv_divid_num][];
        this.update_queue_index[update_id] = new short[(int) recv_divid_num][];
      }

      if (this.verbose) {
        LOG.info("Local Mapper: " + this.local_mapper_id + " recv from remote mapper id: "
            + update_id + " chunk id: " + chunk_id);
      }

      // the update vert list accounts for the adj verts that may be used to update local v
      SCSet scset = this.comm_data_table.getPartition(comm_id).get();
      this.update_queue_pos[update_id][chunk_id] = scset.get_v_offset();
      this.update_queue_counts[update_id][chunk_id] = scset.get_counts_data();
      this.update_queue_index[update_id][chunk_id] = scset.get_counts_index();
    }

    // allreduce to get the minimal sync time over all the mappers and use it as the comm time
    long cur_sync_time = (System.currentTimeMillis() - this.start_sync);
    Table<LongArray> comm_time_table = new Table<>(0, new LongArrMin());
    LongArray comm_time_array = LongArray.create(1, false);
    comm_time_array.get()[0] = cur_sync_time;
    comm_time_table.addPartition(new Partition<>(0, comm_time_array));
    this.mapper.allreduce("sc", "get-global-comm-time", comm_time_table);

    this.time_comm += (comm_time_table.getPartition(0).get().get()[0]);
    // the remainder is local sync time
    this.time_sync += (cur_sync_time - comm_time_table.getPartition(0).get().get()[0]);

    comm_time_array = null;
    comm_time_table = null;

    if (this.mapper_num > 1) {
      ResourcePool.get().clean();
      ConnPool.get().clean();
    }
    System.gc();
  }
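  /*
   * A worked example of the 12/12/8-bit comm_id layout used above: for
   * send_id = 5, local_mapper_id = 2, and parcel id j = 3,
   *
   *   comm_id = (5 << 20) | (2 << 8) | 3 = 0x00500203
   *
   * which the receiving side decodes as
   *
   *   update_id_tmp = comm_id & ((1 << 20) - 1)  == 0x00203  // drops send_id
   *   update_id     = update_id_tmp >>> 8        == 2        // the sending mapper
   *   chunk_id      = update_id_tmp & 0xFF       == 3        // the parcel id
   *
   * The 8-bit parcel field caps send_divid_num at 256 chunks per mapper pair.
   */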
subtemplate: " + sub_id + "; send to mapper: " + send_id); this.start_sync = System.currentTimeMillis(); this.start_misc = System.currentTimeMillis(); //prepare the sending partitions this.comm_data_table = new Table<>(0, new SCSetCombiner()); int comb_len = this.dt.get_num_color_set(this.part.get_passive_index(sub_id)); //prevent the single comm_data array exceeds the limitation of heap allocation 8GB int[] comm_vert_list = this.send_vertex_table.getPartition(send_id).get().get(); if (comm_vert_list != null) { long send_divid_num = (comm_vert_list.length * ((long) comb_len) + this.send_array_limit - 1) / this.send_array_limit; if (this.verbose) { LOG.info("Pipeline Send id: " + send_id + "; send_divid_num: " + send_divid_num + "; comb len: " + comb_len + "; Total vertices num: " + comm_vert_list.length + "; mul: " + comm_vert_list.length * (long) comb_len); } // send_chunks.length == send_divid_num + 1 int[] send_chunks = divide_chunks_comm(comm_vert_list.length, (int) send_divid_num); // store send_chunks for (int j = 0; j < send_divid_num; j++) { int chunk_len = send_chunks[j + 1] - send_chunks[j]; if (this.verbose) LOG.info("Chunk id: " + j + "; chunk size: " + chunk_len); int[] send_chunk_list = new int[chunk_len]; System.arraycopy(comm_vert_list, send_chunks[j], send_chunk_list, 0, chunk_len); //comm_id (32 bits) consists of three parts: 1) send_id (12 bits); 2) local mapper_id (12 bits) 3) array_parcel id (8 bits) int comm_id = ((send_id << 20) | (this.local_mapper_id << 8) | j); SCSet comm_data = compress_send_data(send_chunk_list, comb_len); // SCSet comm_data = compress_send_data_cached(send_chunk_list, comb_len); this.comm_data_table.addPartition(new Partition<>(comm_id, comm_data)); } } if (this.verbose) { LOG.info("Pipeline Finish prepare comm for subtemplate: " + sub_id + "; send to mapper: " + send_id + "; use time: " + (System.currentTimeMillis() - this.start_misc) + "ms"); } //start the regroup communication if (this.verbose) LOG.info("Pipeline Start regroup comm for subtemplate: " + sub_id); this.start_misc = System.currentTimeMillis(); if (this.verbose) LOG.info("Pipeline table partitions: " + this.comm_data_table.getNumPartitions()); this.mapper.regroup("sc", "regroup counts data", this.comm_data_table, new SCPartitioner2(this.mapper_num)); this.mapper.barrier("sc", "atomic regroup sync"); if (this.verbose) { LOG.info("Pipeline Finish regroup comm for subtemplate: " + sub_id + "; use time: " + (System.currentTimeMillis() - this.start_misc) + "ms"); } for (int comm_id : this.comm_data_table.getPartitionIDs()) { int update_id_tmp = (comm_id & ((1 << 20) - 1)); int update_id = (update_id_tmp >>> 8); int chunk_id = (update_id_tmp & ((1 << 8) - 1)); // create update_queue if (this.update_queue_pos[update_id] == null) { long recv_divid_num = (this.update_mapper_len[update_id] * ((long) comb_len) + this.send_array_limit - 1) / this.send_array_limit; this.update_queue_pos[update_id] = new int[(int) recv_divid_num][]; this.update_queue_counts[update_id] = new float[(int) recv_divid_num][]; this.update_queue_index[update_id] = new short[(int) recv_divid_num][]; } if (this.verbose) { LOG.info("Pipeline Local Mapper: " + this.local_mapper_id + " recv from remote mapper id: " + update_id + " chunk id: " + chunk_id); } // update vert list accounts for the adj vert may be used to update local v SCSet scset = this.comm_data_table.getPartition(comm_id).get(); this.update_queue_pos[update_id][chunk_id] = scset.get_v_offset(); this.update_queue_counts[update_id][chunk_id] = 
      this.update_queue_counts[update_id][chunk_id] = scset.get_counts_data();
      this.update_queue_index[update_id][chunk_id] = scset.get_counts_index();
      // there shall be only one update_id sent from one mapper
      update_id_pipeline = update_id;
    }

    // allreduce to get the minimal sync time over all the mappers and use it as the comm time
    long cur_sync_time = (System.currentTimeMillis() - this.start_sync);
    Table<LongArray> comm_time_table = new Table<>(0, new LongArrMin());
    LongArray comm_time_array = LongArray.create(1, false);
    comm_time_array.get()[0] = cur_sync_time;
    comm_time_table.addPartition(new Partition<>(0, comm_time_array));
    this.mapper.allreduce("sc", "get-global-comm-time-atomic", comm_time_table);

    this.time_comm += (comm_time_table.getPartition(0).get().get()[0]);
    // the remainder is local sync time
    this.time_sync += (cur_sync_time - comm_time_table.getPartition(0).get().get()[0]);

    comm_time_array = null;
    comm_time_table = null;

    return update_id_pipeline;
  }

  /**
   * @brief update the received count arrays
   * in the standard regroup version
   *
   * @param sub_id
   * @param threadIdx
   * @param chunks
   */
  private void update_comm_all(int sub_id, int threadIdx, int[] chunks)
      throws BrokenBarrierException, InterruptedException {
    if (threadIdx == 0) {
      active_child = part.get_active_index(sub_id);
      passive_child = part.get_passive_index(sub_id);
      if (this.verbose)
        LOG.info("Start updating remote counts on local vertices");
    }

    count_comm_root[threadIdx] = 0.0d;
    barrier.await();

    int num_combinations_verts_sub = this.choose_table[num_colors][num_verts_table[sub_id]];
    int active_index = part.get_active_index(sub_id);
    int num_verts_a = num_verts_table[active_index];
    // colorset combinations from the active child
    int num_combinations_active_ato = this.choose_table[num_verts_table[sub_id]][num_verts_a];

    // used to calculate the chunk size
    int comb_len = this.dt.get_num_color_set(this.part.get_passive_index(sub_id));
    if (verbose && threadIdx == 0) {
      LOG.info(" Comb len: " + comb_len);
    }

    // this decompress buffer is reused across vertices
    float[] decompress_counts = new float[comb_len];
    // accumulates the updates at position n, then writes them to the dt table
    double[] update_at_n = new double[num_combinations_verts_sub];

    barrier.await();

    // start the update:
    // first loop over local v
    int effect_count = 0; // debug counter
    for (int v = chunks[threadIdx]; v < chunks[threadIdx + 1]; ++v) {
      if (dt.is_vertex_init_active(v)) {
        // clear the update_at_n array
        for (int x = 0; x < num_combinations_verts_sub; x++)
          update_at_n[x] = 0.0d;

        int adj_list_size = this.update_map_size[v];
        if (adj_list_size == 0)
          continue;

        // stores the abs adj ids of v
        int[] adj_list = this.update_map[v];
        float[] counts_a = dt.get_active(v);

        int[] map_ids = this.map_ids_cache_pip[v];
        int[] chunk_ids = this.chunk_ids_cache_pip[v];
        int[] chunk_internal_offsets = this.chunk_internal_offsets_cache_pip[v];

        int compress_interval = 0;

        // second loop over the neighbours, decompressing adj counts from the SCSet
        for (int i = 0; i < adj_list_size; i++) {
          int[] adj_offset_list = this.update_queue_pos[map_ids[i]][chunk_ids[i]];
          // get the offset positions
          int start_pos = adj_offset_list[chunk_internal_offsets[i]];
          int end_pos = adj_offset_list[chunk_internal_offsets[i] + 1];

          if (start_pos != end_pos) {
            // the number of compressed counts
            compress_interval = end_pos - start_pos;

            // get the compressed counts list: all nonzero elements
            float[] adj_counts_list = this.update_queue_counts[map_ids[i]][chunk_ids[i]];
            short[] adj_index_list = this.update_queue_index[map_ids[i]][chunk_ids[i]];
            // ----------- start the decompress process -----------
            for (int x = 0; x < comb_len; x++)
              decompress_counts[x] = 0.0f;
            for (int x = 0; x < compress_interval; x++)
              decompress_counts[(int) adj_index_list[start_pos + x]] = adj_counts_list[start_pos + x];
            // ----------- finish the decompress process -----------

            // third loop over the comb_num of the current subtemplate
            for (int n = 0; n < num_combinations_verts_sub; n++) {
              int[] comb_indexes_a = comb_num_indexes[0][sub_id][n];
              int[] comb_indexes_p = comb_num_indexes[1][sub_id][n];

              // for the passive child
              int p = num_combinations_active_ato - 1;
              // fourth loop over the comb_num of the active/passive subtemplates
              for (int a = 0; a < num_combinations_active_ato; ++a, --p) {
                float count_a = counts_a[comb_indexes_a[a]];
                if (count_a > 0) {
                  update_at_n[n] += ((double) count_a * decompress_counts[comb_indexes_p[p]]);
                }
              }
            } // finish all combination sets for the current template
          } // finish one nonzero adj
        } // finish all adj of a v

        // write the updated values
        for (int n = 0; n < num_combinations_verts_sub; n++) {
          if (sub_id != 0)
            dt.update_comm(v, comb_num_indexes_set[sub_id][n], (float) update_at_n[n]);
          else
            count_comm_root[threadIdx] += update_at_n[n];
        }

        effect_count++;
        if (this.verbose && threadIdx == 0 && (effect_count % 1000 == 0))
          LOG.info("Thd 0 processed: " + effect_count + " vertices");
      } // finish an active v
    } // finish all the v on this thread

    if (this.verbose && threadIdx == 0)
      LOG.info("Thd 0 finished all the vertices");

    decompress_counts = null;
    update_at_n = null;

    barrier.await();
    if (verbose && threadIdx == 0)
      LOG.info("Finish updating remote counts on local vertices");
  }

  /**
   * @brief used in the pipelined regroup-update
   *
   * @param sub_id
   * @param update_mapper_id
   * @param threadIdx excludes thread 0
   * @param chunks chunks that do not include thread 0
   */
  private void update_comm_atomic(int sub_id, int update_mapper_id, int threadIdx, int[] chunks)
      throws BrokenBarrierException, InterruptedException {
    if (threadIdx == 1) {
      active_child = part.get_active_index(sub_id);
      passive_child = part.get_passive_index(sub_id);
      if (this.verbose)
        LOG.info("Start updating remote counts from mapper: " + update_mapper_id + " on local vertices");
    }

    int num_combinations_verts_sub = this.choose_table[num_colors][num_verts_table[sub_id]];
    int active_index = part.get_active_index(sub_id);
    int num_verts_a = num_verts_table[active_index];
    // colorset combinations from the active child
    int num_combinations_active_ato = this.choose_table[num_verts_table[sub_id]][num_verts_a];

    // used to calculate the chunk size
    int comb_len = this.dt.get_num_color_set(this.part.get_passive_index(sub_id));
    if (verbose && threadIdx == 1)
      LOG.info(" Comb len: " + comb_len);

    // this decompress buffer is reused across vertices
    float[] decompress_counts = new float[comb_len];
    // accumulates the updates at position n, then writes them to the dt table
    double[] update_at_n = new double[num_combinations_verts_sub];

    // start the update:
    // first loop over local v
    int effect_count = 0;
    for (int v = chunks[threadIdx - 1]; v < chunks[threadIdx]; ++v) {
      if (dt.is_vertex_init_active(v)) {
        // clear the update_at_n array
        for (int x = 0; x < num_combinations_verts_sub; x++)
          update_at_n[x] = 0.0d;

        int adj_list_size = this.update_map_size[v];
        if (adj_list_size == 0)
          continue;

        // ---- the map_ids, chunk_ids, and chunk_internal_offsets are cached and re-used in each rotation ----
        // assertTrue("adj_list_size 0", (adj_list_size > 0));
        // stores the abs adj ids of v
        int[] adj_list = this.update_map[v];
        // assertTrue("adj_list null", (adj_list != null));
        float[] counts_a = dt.get_active(v);
assertTrue("counts_a null", (counts_a != null)); int compress_interval = 0; // retrieve map_id and chunk id for each adj in adj_list int[] map_ids = this.map_ids_cache_pip[v]; int[] chunk_ids = this.chunk_ids_cache_pip[v]; int[] chunk_internal_offsets = this.chunk_internal_offsets_cache_pip[v]; //second loop over nbrs, decompress adj from Scset for (int rand_i = 0; rand_i < adj_list_size; rand_i++) { if (map_ids[rand_i] != update_mapper_id || this.update_queue_pos[map_ids[rand_i]] == null || this.update_queue_pos[map_ids[rand_i]][chunk_ids[rand_i]] == null) { continue; } //get a non-null adj, received by pipeline comm in last turn int[] adj_offset_list = this.update_queue_pos[map_ids[rand_i]][chunk_ids[rand_i]]; // get the offset pos int start_pos = adj_offset_list[chunk_internal_offsets[rand_i]]; int end_pos = adj_offset_list[chunk_internal_offsets[rand_i] + 1]; if (start_pos != end_pos) { // the num of compressed counts compress_interval = end_pos - start_pos; // get the compressed counts list, all nonzero elements float[] adj_counts_list = this.update_queue_counts[map_ids[rand_i]][chunk_ids[rand_i]]; short[] adj_index_list = this.update_queue_index[map_ids[rand_i]][chunk_ids[rand_i]]; // assertTrue("adj_counts_list null", (adj_counts_list != null)); // assertTrue("adj_index_list null", (adj_index_list != null)); // ----------- start decompress process ----------- for (int x = 0; x < comb_len; x++) decompress_counts[x] = 0.0f; // assertTrue("compress_interval negative", (compress_interval > 0)); for (int x = 0; x < compress_interval; x++) { // assertTrue("adj index out of bound", (adj_index_list[start_pos + x] < comb_len && adj_index_list[start_pos + x] >=0 )); // assertTrue(" adj counts list out of bound", (start_pos + x < adj_counts_list.length)); decompress_counts[(int) adj_index_list[start_pos + x]] = adj_counts_list[start_pos + x]; } // ----------- Finish decompress process ----------- //third loop over comb_num for cur subtemplate for (int n = 0; n < num_combinations_verts_sub; n++) { // more details int[] comb_indexes_a = this.comb_num_indexes[0][sub_id][n]; int[] comb_indexes_p = this.comb_num_indexes[1][sub_id][n]; // assertTrue("comb_indexes_a null", (comb_indexes_a != null)); // assertTrue("comb_indexes_p null", (comb_indexes_p != null)); // for passive int p = num_combinations_active_ato - 1; // // fourth loop over comb_num for active/passive subtemplates for (int a = 0; a < num_combinations_active_ato; ++a, --p) { // assertTrue("counts_a null", (counts_a != null)); // assertTrue("a out of bound ", (a < comb_indexes_a.length)); // assertTrue("comb_indexes_a not in range", (comb_indexes_a[a] < counts_a.length)); float count_a = counts_a[comb_indexes_a[a]]; if (count_a > 0) { update_at_n[n] += ((double) count_a * decompress_counts[comb_indexes_p[p]]); } } } // finish all combination sets for cur template } // finish all nonzero adj } // finish all adj of a v //write upated value for (int n = 0; n < num_combinations_verts_sub; n++) { if (sub_id != 0) dt.update_comm(v, comb_num_indexes_set[sub_id][n], (float) update_at_n[n]); else count_comm_root[threadIdx] += update_at_n[n]; } effect_count++; if (this.verbose && threadIdx == 1 && (effect_count % 1000 == 0)) LOG.info("Trace update counts for Thd 1: " + effect_count); } // finishe an active v } // finish all the v on thread //debug if (this.verbose && threadIdx == 1) LOG.info("Trace update counts for Thd 1 Finished"); decompress_counts = null; update_at_n = null; if (verbose && threadIdx == 1) LOG.info("Finish updating remote 
counts from mapper: " + update_mapper_id + " on local vertex"); } /** * @brief compress local vert data for remote mappers * cache the compressed data of each local v into a global lookup table * this will consume more memory space * single thread * * @param vert_list * @param num_comb_max * * @return */ public SCSet compress_send_data_cached(int[] vert_list, int num_comb_max) { this.start_comm = System.currentTimeMillis(); int v_num = vert_list.length; int[] v_offset = new int[v_num + 1]; //to be trimed //counts_idx_tmp is not required, can be removed float[] counts_data_tmp = new float[num_comb_max * v_num]; // float[] compress_array = new float[num_comb_max]; // compress index uses short to save memory, support up to 32767 as max_comb_len short[] counts_index_tmp = new short[num_comb_max * v_num]; // short[] compress_index = new short[num_comb_max]; int count_num = 0; int effective_v = 0; for (int i = 0; i < v_num; i++) { v_offset[i] = count_num; //get the abs vert id int comm_vert_id = vert_list[i]; int rel_vert_id = this.g.get_relative_v_id(comm_vert_id); //if comm_vert_id is not in local graph if (rel_vert_id < 0 || (this.dt.is_vertex_init_passive(rel_vert_id) == false)) continue; // float[] counts_arry = null; float[] compress_counts = this.compress_cache_array[rel_vert_id]; short[] compress_index = this.compress_cache_index[rel_vert_id]; if (compress_counts == null) { //compress the sending counts and index float[] counts_raw = this.dt.get_passive(rel_vert_id); if (counts_raw == null) { LOG.info("ERROR: null passive counts array"); continue; } //check length if (counts_raw.length != num_comb_max) { LOG.info("ERROR: comb_max and passive counts len not matched"); continue; } // further compress nonzero values // float[] compress_counts_tmp = new float[num_comb_max]; // short[] compress_index_tmp = new short[num_comb_max]; // int compress_itr = 0; // for(int j=0; j<counts_raw.length; j++) // { // if (counts_raw[j] > 0.0f) // { // compress_counts_tmp[compress_itr] = counts_raw[j]; // compress_index_tmp[compress_itr] = (short)j; // compress_itr++; // } // } // // trim the tmp arrays // compress_counts = new float[compress_itr]; // compress_index = new short[compress_itr]; // System.arraycopy(compress_counts_tmp, 0, compress_counts, 0, compress_itr); // System.arraycopy(compress_index_tmp, 0, compress_index, 0, compress_itr); // further compress nonzero values compress_counts = new float[num_comb_max]; compress_index = new short[num_comb_max]; int compress_itr = 0; for (int j = 0; j < counts_raw.length; j++) { if (counts_raw[j] > 0.0f) { compress_counts[compress_itr] = counts_raw[j]; compress_index[compress_itr] = (short) j; compress_itr++; } } //store compress data in the table this.compress_cache_array[rel_vert_id] = compress_counts; this.compress_cache_index[rel_vert_id] = compress_index; this.compress_cache_len[rel_vert_id] = compress_itr; // compress_counts_tmp = null; // compress_index_tmp = null; } int compress_len = this.compress_cache_len[rel_vert_id]; effective_v++; // System.arraycopy(compress_counts, 0, counts_data_tmp, count_num, compress_counts.length); // System.arraycopy(compress_index, 0, counts_index_tmp, count_num, compress_index.length); // count_num += compress_counts.length; System.arraycopy(compress_counts, 0, counts_data_tmp, count_num, compress_len); System.arraycopy(compress_index, 0, counts_index_tmp, count_num, compress_len); count_num += compress_len; } v_offset[v_num] = count_num; //trim the tmp array float[] counts_data = new float[count_num]; short[] 
    short[] counts_index = new short[count_num];
    System.arraycopy(counts_data_tmp, 0, counts_data, 0, count_num);
    System.arraycopy(counts_index_tmp, 0, counts_index, 0, count_num);
    counts_data_tmp = null;
    counts_index_tmp = null;

    SCSet set = new SCSet(v_num, count_num, v_offset, counts_data, counts_index);

    if (this.verbose) {
      long effective_size = ((long) effective_v * num_comb_max);
      LOG.info("Estimated counts size w/o compression: " + effective_size + "; mem usage: " + (effective_size * 4));
      LOG.info("Actual counts array size after compression: " + count_num + "; mem usage: " + ((long) count_num * 6));
      LOG.info("Save memory " + ((effective_size * 4 - (long) count_num * 6) / (double) (effective_size * 4)) * 100.0f + "%");
    }

    this.time_comm += (System.currentTimeMillis() - this.start_comm);
    return set;
  }

  public SCSet compress_send_data(int[] vert_list, int num_comb_max) {
    this.start_comm = System.currentTimeMillis();

    int v_num = vert_list.length;
    int[] v_offset = new int[v_num + 1];

    // temporary arrays, trimmed afterwards
    float[] counts_data_tmp = new float[num_comb_max * v_num];
    float[] compress_array = new float[num_comb_max];
    // the compressed index uses short to save memory, supporting up to 32767 as max_comb_len
    short[] counts_index_tmp = new short[num_comb_max * v_num];
    short[] compress_index = new short[num_comb_max];

    int count_num = 0;
    int effective_v = 0;

    for (int i = 0; i < v_num; i++) {
      v_offset[i] = count_num;

      // get the abs vert id
      int comm_vert_id = vert_list[i];
      int rel_vert_id = this.g.get_relative_v_id(comm_vert_id);

      // skip if comm_vert_id is not in the local graph
      if (rel_vert_id < 0 || (this.dt.is_vertex_init_passive(rel_vert_id) == false))
        continue;

      // compress the sending counts and index
      float[] counts_raw = this.dt.get_passive(rel_vert_id);
      if (counts_raw == null) {
        LOG.info("ERROR: null passive counts array");
        continue;
      }
      // check the length
      if (counts_raw.length != num_comb_max) {
        LOG.info("ERROR: comb_max and passive counts len not matched");
        continue;
      }

      int compress_itr = 0;
      for (int j = 0; j < counts_raw.length; j++) {
        if (counts_raw[j] > 0.0f) {
          compress_array[compress_itr] = counts_raw[j];
          compress_index[compress_itr] = (short) j;
          compress_itr++;
        }
      }

      effective_v++;
      System.arraycopy(compress_array, 0, counts_data_tmp, count_num, compress_itr);
      System.arraycopy(compress_index, 0, counts_index_tmp, count_num, compress_itr);
      count_num += compress_itr;
    }

    v_offset[v_num] = count_num;

    // trim the temporary arrays
    float[] counts_data = new float[count_num];
    short[] counts_index = new short[count_num];
    System.arraycopy(counts_data_tmp, 0, counts_data, 0, count_num);
    System.arraycopy(counts_index_tmp, 0, counts_index, 0, count_num);
    counts_data_tmp = null;
    counts_index_tmp = null;
    compress_array = null;
    compress_index = null;

    SCSet set = new SCSet(v_num, count_num, v_offset, counts_data, counts_index);

    if (this.verbose) {
      long effective_size = ((long) effective_v * num_comb_max);
      LOG.info("Estimated counts size w/o compression: " + effective_size + "; mem usage: " + (effective_size * 4));
      LOG.info("Actual counts array size after compression: " + count_num + "; mem usage: " + ((long) count_num * 6));
      LOG.info("Save memory " + ((effective_size * 4 - (long) count_num * 6) / (double) (effective_size * 4)) * 100.0f + "%");
    }

    this.time_comm += (System.currentTimeMillis() - this.start_comm);
    return set;
  }
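  /*
   * The SCSet payload produced by both compress methods is a CSR-like sparse
   * layout. For num_comb_max = 4 and two vertices whose passive count rows are
   *
   *   v0: {0, 1.5, 0, 2.0}     v1: {0, 0, 0, 0}
   *
   * the compressed arrays become
   *
   *   v_offset     = {0, 2, 2}   // per-vertex ranges into the data arrays
   *   counts_data  = {1.5, 2.0}  // nonzero counts only
   *   counts_index = {1, 3}      // their positions in the dense row
   *
   * and the receiver restores v0's dense row by scattering counts_data[x]
   * into slot counts_index[x] for x in [v_offset[0], v_offset[1]), exactly
   * the decompress loop in update_comm_all / update_comm_atomic.
   */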
    /**
     * @brief calculate and print out the counts for each intermediate subtemplate
     *
     * @param sub_id
     * @param threadIdx
     * @param chunks
     *
     * @return
     */
    private void print_counts(int sub_id, int threadIdx, int[] chunks)
        throws BrokenBarrierException, InterruptedException {
        cc_ato[threadIdx] = 0.0;
        for (int v = chunks[threadIdx]; v < chunks[threadIdx + 1]; ++v) {
            float[] counts_a_test_cur = dt.get_table(sub_id, v);
            if (counts_a_test_cur != null) {
                for (int x = 0; x < counts_a_test_cur.length; x++)
                    cc_ato[threadIdx] += (double) counts_a_test_cur[x];
            }
        }
    }

    public long get_comm_time() {
        return this.time_comm;
    }

    public long get_sync_time() {
        return this.time_sync;
    }

    /**
     * @brief update the received counts for each local vertex
     *
     * @param sub_id
     * @param threadIdx
     * @param chunks
     *
     * @return
     */
    private void regroup_update_all(int sub_id, int threadIdx, int[] chunks)
        throws BrokenBarrierException, InterruptedException {
        // single-thread communication
        if (threadIdx == 0)
            regroup_comm_all(sub_id);

        this.barrier.await();

        // release the cached sending data
        // after the regroup update for one subtemplate
        if (threadIdx == 0) {
            for (int i = 0; i < this.num_verts_graph; i++) {
                this.compress_cache_array[i] = null;
                this.compress_cache_index[i] = null;
            }
        }

        this.barrier.await();

        // precompute the mapper ids, chunk ids, and offsets for the adjacency list of each local v
        calculate_update_ids(sub_id, threadIdx);

        this.barrier.await();

        try {
            update_comm_all(sub_id, threadIdx, chunks);
        } catch (InterruptedException | BrokenBarrierException e) {
            e.printStackTrace();
        }

        this.barrier.await();

        // release the precomputed ids
        release_update_ids(threadIdx);

        this.barrier.await();

        if (threadIdx == 0)
            recycleMem();
    }
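    // ---------------------------------------------------------------------
    // Editor's sketch (not part of the original code): the chunking math
    // used by calculate_update_ids below. A mapper's adjacency queue of
    // adjListLen vertices, each carrying combLen counts, is split into
    // ceil(adjListLen * combLen / sendLimit) chunks; a queue offset is then
    // mapped to a (chunkId, offsetInChunk) pair, with the division remainder
    // folded into the last chunk. Names are illustrative only.
    // ---------------------------------------------------------------------
    private static int[] chunkPositionSketch(int adjOffset, int adjListLen, long combLen, long sendLimit) {
        int chunkSize = (int) ((adjListLen * combLen + sendLimit - 1) / sendLimit); // ceil division
        int chunkLen = adjListLen / chunkSize;   // floor; remainder handled below
        int chunkId = adjOffset / chunkLen;
        int offsetInChunk = adjOffset % chunkLen;
        if (chunkId > chunkSize - 1) {           // offset fell into the remainder tail
            chunkId = chunkSize - 1;             // assign it to the last chunk
            offsetInChunk += chunkLen;
        }
        return new int[] { chunkId, offsetInChunk };
    }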
    /**
     * @brief precompute the chunk ids and offsets in the communication packets
     *
     * @param sub_id
     * @param threadIdx
     *
     * @return
     */
    private void calculate_update_ids(int sub_id, int threadIdx)
        throws BrokenBarrierException, InterruptedException {
        int comb_len = this.dt.get_num_color_set(this.part.get_passive_index(sub_id));

        for (int v = this.chunks[threadIdx]; v < this.chunks[threadIdx + 1]; ++v) {
            if (dt.is_vertex_init_active(v)) {
                int adj_list_size = this.update_map_size[v];
                if (adj_list_size == 0)
                    continue;

                // --- cache and re-use the map_ids, chunk_ids, and chunk_internal_offsets in each rotation ---
                // store the abs adj ids for v
                int[] adj_list = this.update_map[v];

                // retrieve the map id and chunk id for each adj in adj_list
                int[] map_ids = new int[adj_list_size];
                int[] chunk_ids = new int[adj_list_size];
                int[] chunk_internal_offsets = new int[adj_list_size];

                int adj_id = 0;
                int adj_offset = 0;
                int map_id = 0;
                int adj_list_len = 0;
                int chunk_size = 0;
                int chunk_len = 0;
                int chunk_id = 0;
                int chunk_internal_offset = 0;

                // calculate map_ids, chunk_ids, and chunk_internal_offsets
                for (int j = 0; j < adj_list_size; j++) {
                    adj_id = adj_list[j];
                    adj_offset = this.abs_v_to_queue[adj_id];
                    map_id = this.abs_v_to_mapper[adj_id];
                    adj_list_len = this.update_mapper_len[map_id];

                    // calculate the chunk id
                    chunk_size = (int) ((adj_list_len * (long) comb_len + this.send_array_limit - 1)
                        / this.send_array_limit);
                    chunk_len = adj_list_len / chunk_size;
                    // from 0 to chunk_size - 1
                    chunk_id = adj_offset / chunk_len;
                    chunk_internal_offset = adj_offset % chunk_len;

                    // fold the remainder into the last chunk
                    if (chunk_id > chunk_size - 1) {
                        chunk_id = chunk_size - 1;
                        chunk_internal_offset += chunk_len;
                    }

                    map_ids[j] = map_id;
                    chunk_ids[j] = chunk_id;
                    chunk_internal_offsets[j] = chunk_internal_offset;
                }

                // store the ids for this v
                this.map_ids_cache_pip[v] = map_ids;
                this.chunk_ids_cache_pip[v] = chunk_ids;
                this.chunk_internal_offsets_cache_pip[v] = chunk_internal_offsets;
            } // finished an active v
        }
    }

    private void release_update_ids(int threadIdx) {
        for (int v = this.chunks[threadIdx]; v < this.chunks[threadIdx + 1]; ++v) {
            this.map_ids_cache_pip[v] = null;
            this.chunk_ids_cache_pip[v] = null;
            this.chunk_internal_offsets_cache_pip[v] = null;
        }
    }

    /**
     * @brief regroup and update cross-partition counts in a pipelined way.
     * The original AlltoAll-like regroup is decomposed into this.mapper_num turns.
     * In each turn, thread 0 sends/receives partitions to/from a certain mapper while the
     * other threads update the local counts with the data received in the previous turn.
     * The send/receive sequence is predefined.
     *
     * The pipelined version requires at least two Java threads.
     *
     * @param sub_id
     * @param threadIdx
     *
     * @return
     */
    private void regroup_update_pipeline(int sub_id, int threadIdx)
        throws BrokenBarrierException, InterruptedException {
        // first turn of regroup: send data to the neighbour mapper
        // and record the update id as the received mapper id
        if (threadIdx == 0) {
            this.pipeline_send_id = (this.local_mapper_id + 1) % this.mapper_num;
            this.pipeline_recv_id = regroup_comm_atomic(sub_id, this.pipeline_send_id);
            this.pipeline_update_id = this.pipeline_recv_id;
        }

        this.barrier.await();

        // precompute the mapper ids, chunk ids, and offsets for the adjacency list of each local v
        calculate_update_ids(sub_id, threadIdx);

        this.barrier.await();

        // start the update
        if (this.mapper_num == 2) {
            // no need for a pipeline; all the data has been transferred
            update_comm_all(sub_id, threadIdx, this.chunks);
            this.barrier.await();
            if (threadIdx == 0)
                recycleMem();
            this.barrier.await();
        } else {
            // start the pipelined comm and update (mapper_num > 2)
            this.count_comm_root[threadIdx] = 0.0d;

            for (int i = 0; i < this.mapper_num - 2; i++) {
                if (threadIdx == 0) {
                    this.pipeline_send_id++;
                    this.pipeline_send_id = this.pipeline_send_id % this.mapper_num;
                    this.pipeline_recv_id = regroup_comm_atomic(sub_id, this.pipeline_send_id);
                    if (this.verbose)
                        LOG.info("Pipeline " + i + " finish comm send id: " + this.pipeline_send_id
                            + "; recv id: " + this.pipeline_recv_id);
                } else {
                    // update local counts with the data received in the previous turn
                    update_comm_atomic(sub_id, this.pipeline_update_id, threadIdx, this.chunks_pipeline);
                    if (this.verbose && threadIdx == 1)
                        LOG.info("Pipeline " + i + " finish compute on data from mapper "
                            + this.pipeline_update_id);
                }

                // sync and wait for comm and update to finish
                this.barrier.await();

                if (threadIdx == 0) {
                    if (this.verbose)
                        LOG.info("Start Pipeline Mem Recycle");
                    recycleMemPipeline(this.pipeline_update_id);
                    this.pipeline_update_id = this.pipeline_recv_id;
                    if (this.verbose)
                        LOG.info("Finish Pipeline Mem Recycle");
                }

                this.barrier.await();
            }

            // finish updating the comm data from the last pipeline step
            if (this.verbose && threadIdx == 0)
                LOG.info("Update Last remain of pipeline");

            if (threadIdx != 0)
                update_comm_atomic(sub_id, this.pipeline_update_id, threadIdx, this.chunks_pipeline);

            this.barrier.await();

            if (threadIdx == 0)
                recycleMem();

            this.barrier.await();
        }

        // release the compressed sending data
        if (threadIdx == 0) {
            for (int i = 0; i < this.num_verts_graph; i++) {
                this.compress_cache_array[i] = null;
                this.compress_cache_index[i] = null;
            }
        }

        this.barrier.await();

        // release the precomputed ids
        release_update_ids(threadIdx);

        this.barrier.await();

        if (this.verbose && threadIdx == 0)
            LOG.info("Finish pipeline for sub: " + sub_id);
    }
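    // ---------------------------------------------------------------------
    // Editor's sketch (not part of the original code): the ring schedule
    // driven by pipeline_send_id above. Starting from its right neighbour,
    // each mapper exchanges with (localId + 1 + turn) % mapperNum, so after
    // mapperNum - 1 turns every remote mapper has been visited exactly once,
    // while computation on the previous turn's data overlaps the exchange.
    // The method name is illustrative only.
    // ---------------------------------------------------------------------
    private static int[] pipelineScheduleSketch(int localId, int mapperNum) {
        int[] order = new int[mapperNum - 1];
        for (int turn = 0; turn < mapperNum - 1; turn++)
            order[turn] = (localId + 1 + turn) % mapperNum;
        return order; // e.g. localId = 2, mapperNum = 4 -> [3, 0, 1]
    }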
    /**
     * @brief release the JVM memory and the harp comm table
     *
     * @return
     */
    private void recycleMem() {
        // clean up memory
        if (this.comm_data_table != null) {
            this.comm_data_table.free();
            this.comm_data_table = null;
        }

        for (int k = 0; k < this.mapper_num; k++) {
            if (this.update_queue_pos[k] != null) {
                for (int x = 0; x < this.update_queue_pos[k].length; x++)
                    this.update_queue_pos[k][x] = null;
            }
            this.update_queue_pos[k] = null;

            if (this.update_queue_counts[k] != null) {
                for (int x = 0; x < this.update_queue_counts[k].length; x++)
                    this.update_queue_counts[k][x] = null;
            }
            this.update_queue_counts[k] = null;

            if (this.update_queue_index[k] != null) {
                for (int x = 0; x < this.update_queue_index[k].length; x++)
                    this.update_queue_index[k][x] = null;
            }
            this.update_queue_index[k] = null;
        }

        ConnPool.get().clean();
        System.gc();
    }

    /**
     * @brief release the JVM memory and the harp comm table
     * after a pipeline step
     *
     * @return
     */
    private void recycleMemPipeline(int k) {
        if (this.comm_data_table != null) {
            // remove items from inUseMap
            this.comm_data_table.free();
            this.comm_data_table = null;
        }

        // only clean up the memory associated with mapper k
        if (this.update_queue_pos[k] != null) {
            for (int x = 0; x < this.update_queue_pos[k].length; x++)
                this.update_queue_pos[k][x] = null;
        }
        this.update_queue_pos[k] = null;

        if (this.update_queue_counts[k] != null) {
            for (int x = 0; x < this.update_queue_counts[k].length; x++)
                this.update_queue_counts[k][x] = null;
        }
        this.update_queue_counts[k] = null;

        if (this.update_queue_index[k] != null) {
            for (int x = 0; x < this.update_queue_index[k].length; x++)
                this.update_queue_index[k][x] = null;
        }
        this.update_queue_index[k] = null;

        ConnPool.get().clean();
        System.gc();
    }
}
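The regroup/update methods above all follow the same BSP-LRT pattern: a fixed pool of threads meets at a CyclicBarrier, thread 0 alone performs the communication phase, and all threads then process their own chunk of vertices. Below is a minimal, self-contained sketch of that pattern, with illustrative names and no Harp dependencies, not the original implementation.

import java.util.concurrent.CyclicBarrier;

public class BarrierPatternSketch {
    public static void main(String[] args) throws InterruptedException {
        final int threadNum = 4;
        final CyclicBarrier barrier = new CyclicBarrier(threadNum);
        Thread[] pool = new Thread[threadNum];
        for (int t = 0; t < threadNum; t++) {
            final int threadIdx = t;
            pool[t] = new Thread(() -> {
                try {
                    // phase 1: single-thread communication, everyone else waits
                    if (threadIdx == 0)
                        System.out.println("thread 0: exchanging partitions");
                    barrier.await();
                    // phase 2: all threads update their own vertex chunk
                    System.out.println("thread " + threadIdx + ": updating local chunk");
                    barrier.await();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            });
            pool[t].start();
        }
        for (Thread th : pool)
            th.join();
    }
}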