Java - ">" vs. ">=" causes significant performance difference
I stumbled upon something curious. At first I thought it might be a case of branch misprediction, but I cannot explain why branch misprediction should cause this phenomenon. I implemented two versions of bubble sort in Java and did some performance tests:
import java.util.Random;

public class BubbleSortAnnomaly {

    public static void main(String... args) {
        final int ARRAY_SIZE = Integer.parseInt(args[0]);
        final int LIMIT = Integer.parseInt(args[1]);
        final int RUNS = Integer.parseInt(args[2]);

        int[] a = new int[ARRAY_SIZE];
        int[] b = new int[ARRAY_SIZE];

        Random r = new Random();
        for (int run = 0; RUNS > run; ++run) {
            for (int i = 0; i < ARRAY_SIZE; i++) {
                a[i] = r.nextInt(LIMIT);
                b[i] = a[i];
            }

            System.out.print("Sorting with sortA: ");
            long start = System.nanoTime();
            int swaps = bubbleSortA(a);
            System.out.println((System.nanoTime() - start) + " ns. "
                               + "It used " + swaps + " swaps.");

            System.out.print("Sorting with sortB: ");
            start = System.nanoTime();
            swaps = bubbleSortB(b);
            System.out.println((System.nanoTime() - start) + " ns. "
                               + "It used " + swaps + " swaps.");
        }
    }

    public static int bubbleSortA(int[] a) {
        int counter = 0;
        for (int i = a.length - 1; i >= 0; --i) {
            for (int j = 0; j < i; ++j) {
                if (a[j] > a[j + 1]) {
                    swap(a, j, j + 1);
                    ++counter;
                }
            }
        }
        return counter;
    }

    public static int bubbleSortB(int[] a) {
        int counter = 0;
        for (int i = a.length - 1; i >= 0; --i) {
            for (int j = 0; j < i; ++j) {
                if (a[j] >= a[j + 1]) {
                    swap(a, j, j + 1);
                    ++counter;
                }
            }
        }
        return counter;
    }

    private static void swap(int[] a, int j, int i) {
        int h = a[i];
        a[i] = a[j];
        a[j] = h;
    }
}
As you can see, the only difference between the two sorting methods is ">" vs. ">=". When running the program with java BubbleSortAnnomaly 50000 10 10, one would expect sortB to be slower than sortA, because it has to execute more swaps. But I got the following (or similar) output on three different machines:
Sorting with sortA: 4.214 seconds. It used 564960211 swaps.
Sorting with sortB: 2.278 seconds. It used 1249750569 swaps.
Sorting with sortA: 4.199 seconds. It used 563355818 swaps.
Sorting with sortB: 2.254 seconds. It used 1249750348 swaps.
Sorting with sortA: 4.189 seconds. It used 560825110 swaps.
Sorting with sortB: 2.264 seconds. It used 1249749572 swaps.
Sorting with sortA: 4.17 seconds. It used 561924561 swaps.
Sorting with sortB: 2.256 seconds. It used 1249749766 swaps.
Sorting with sortA: 4.198 seconds. It used 562613693 swaps.
Sorting with sortB: 2.266 seconds. It used 1249749880 swaps.
Sorting with sortA: 4.19 seconds. It used 561658723 swaps.
Sorting with sortB: 2.281 seconds. It used 1249751070 swaps.
Sorting with sortA: 4.193 seconds. It used 564986461 swaps.
Sorting with sortB: 2.266 seconds. It used 1249749681 swaps.
Sorting with sortA: 4.203 seconds. It used 562526980 swaps.
Sorting with sortB: 2.27 seconds. It used 1249749609 swaps.
Sorting with sortA: 4.176 seconds. It used 561070571 swaps.
Sorting with sortB: 2.241 seconds. It used 1249749831 swaps.
Sorting with sortA: 4.191 seconds. It used 559883210 swaps.
Sorting with sortB: 2.257 seconds. It used 1249749371 swaps.
When I set the parameter LIMIT to, e.g., 50000 (java BubbleSortAnnomaly 50000 50000 10), I get the expected results:
Sorting with sortA: 3982697438 ns. It used 625941897 swaps.
Sorting with sortB: 4657909823 ns. It used 789391382 swaps.
I ported the program to C++ to determine whether the problem is Java-specific. Here is the C++ code:
#include <cstdlib>
#include <iostream>
#include <omp.h>

#ifndef ARRAY_SIZE
#define ARRAY_SIZE 50000
#endif

#ifndef LIMIT
#define LIMIT 10
#endif

#ifndef RUNS
#define RUNS 10
#endif

void swap(int * a, int i, int j) {
    int h = a[i];
    a[i] = a[j];
    a[j] = h;
}

int bubbleSortA(int * a) {
    const int LAST = ARRAY_SIZE - 1;
    int counter = 0;
    for (int i = LAST; i > 0; --i) {
        for (int j = 0; j < i; ++j) {
            int next = j + 1;
            if (a[j] > a[next]) {
                swap(a, j, next);
                ++counter;
            }
        }
    }
    return counter;
}

int bubbleSortB(int * a) {
    const int LAST = ARRAY_SIZE - 1;
    int counter = 0;
    for (int i = LAST; i > 0; --i) {
        for (int j = 0; j < i; ++j) {
            int next = j + 1;
            if (a[j] >= a[next]) {
                swap(a, j, next);
                ++counter;
            }
        }
    }
    return counter;
}

int main() {
    int * a = (int *) malloc(ARRAY_SIZE * sizeof(int));
    int * b = (int *) malloc(ARRAY_SIZE * sizeof(int));

    for (int run = 0; RUNS > run; ++run) {
        for (int idx = 0; idx < ARRAY_SIZE; ++idx) {
            a[idx] = std::rand() % LIMIT;
            b[idx] = a[idx];
        }

        std::cout << "Sorting with sortA: ";
        double start = omp_get_wtime();
        int swaps = bubbleSortA(a);
        std::cout << (omp_get_wtime() - start) << " seconds. It used " << swaps
                  << " swaps." << std::endl;

        std::cout << "Sorting with sortB: ";
        start = omp_get_wtime();
        swaps = bubbleSortB(b);
        std::cout << (omp_get_wtime() - start) << " seconds. It used " << swaps
                  << " swaps." << std::endl;
    }

    free(a);
    free(b);

    return 0;
}
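(For reference: omp_get_wtime comes from OpenMP, so the port has to be compiled with OpenMP enabled; with GCC that would be something like g++ -O2 -fopenmp. Thanks to the #ifndef guards, ARRAY_SIZE, LIMIT, and RUNS can be overridden with -D flags at compile time.)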
This program shows the same behaviour. Can someone explain what exactly is going on here? Executing sortB first and then sortA does not change the results.
I think it may indeed be due to branch prediction. If you count the number of swaps compared to the number of inner sort iterations, you find:
Limit = 10

- A = 560M swaps / 1250M loops
- B = 1250M swaps / 1250M loops (0.02% fewer swaps than loops)

Limit = 50000

- A = 627M swaps / 1250M loops
- B = 850M swaps / 1250M loops
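These counts are easy to reproduce. Here is a minimal sketch (my own instrumented variant, not part of the programs above) that counts inner-loop iterations alongside swaps for the ">=" version; note the iteration count is always n*(n-1)/2 regardless of the data:

// Instrumented copy of bubbleSortB: same loops and same ">=" comparison,
// with an explicit counter for inner-loop iterations next to the swap counter.
public static long[] countSwapsAndLoops(int[] a) {
    long swaps = 0;
    long loops = 0;
    for (int i = a.length - 1; i >= 0; --i) {
        for (int j = 0; j < i; ++j) {
            ++loops;                     // one per comparison, n*(n-1)/2 in total
            if (a[j] >= a[j + 1]) {      // the ">=" variant; use ">" for sortA
                int h = a[j];
                a[j] = a[j + 1];
                a[j + 1] = h;
                ++swaps;
            }
        }
    }
    return new long[] { swaps, loops };
}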
So in the Limit == 10 case the swap is performed 99.98% of the time in the B sort, which is obviously favourable for the branch predictor. In the Limit == 50000 case the swap is only hit randomly, about 68% of the time, so the branch predictor is less beneficial.
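One way to test this hypothesis (my suggestion, not something measured above) is to make the inner loop branch-free: write min/max back unconditionally, so there is no data-dependent branch left for the predictor to win or lose on. If the timing gap between the two variants then disappears, branch prediction was the likely cause. A minimal sketch:

// Branch-free inner step: the conditional swap is replaced by unconditional
// min/max writes. Semantically this matches the ">" variant, because equal
// neighbours are left in place.
public static int bubbleSortBranchless(int[] a) {
    int counter = 0;
    for (int i = a.length - 1; i >= 0; --i) {
        for (int j = 0; j < i; ++j) {
            int x = a[j];
            int y = a[j + 1];
            // A simple counting ternary like this is often compiled to a
            // conditional move rather than a branch.
            counter += (x > y) ? 1 : 0;
            a[j] = Math.min(x, y);
            a[j + 1] = Math.max(x, y);
        }
    }
    return counter;
}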