java - > vs. >= causes significant performance difference -


i stumbled upon something. @ first thought might case of branch misprediction in case, cannot explain why branch misprediction should cause phenomenon. implemented 2 versions of bubble sort in java , did performance tests:

import java.util.random;  public class bubblesortannomaly {      public static void main(string... args) {         final int array_size = integer.parseint(args[0]);         final int limit = integer.parseint(args[1]);         final int runs = integer.parseint(args[2]);          int[] = new int[array_size];         int[] b = new int[array_size];         random r = new random();         (int run = 0; runs > run; ++run) {             (int = 0; < array_size; i++) {                 a[i] = r.nextint(limit);                 b[i] = a[i];             }              system.out.print("sorting sorta: ");             long start = system.nanotime();             int swaps = bubblesorta(a);              system.out.println(  (system.nanotime() - start) + " ns. "                                + "it used " + swaps + " swaps.");              system.out.print("sorting sortb: ");             start = system.nanotime();             swaps = bubblesortb(b);              system.out.println(  (system.nanotime() - start) + " ns. "                                + "it used " + swaps + " swaps.");         }     }      public static int bubblesorta(int[] a) {         int counter = 0;         (int = a.length - 1; >= 0; --i) {             (int j = 0; j < i; ++j) {                 if (a[j] > a[j + 1]) {                     swap(a, j, j + 1);                     ++counter;                 }             }         }         return (counter);     }      public static int bubblesortb(int[] a) {         int counter = 0;         (int = a.length - 1; >= 0; --i) {             (int j = 0; j < i; ++j) {                 if (a[j] >= a[j + 1]) {                     swap(a, j, j + 1);                     ++counter;                 }             }         }         return (counter);     }      private static void swap(int[] a, int j, int i) {         int h = a[i];         a[i] = a[j];         a[j] = h;     } } 

as can see, difference between 2 sorting methods > vs. >=. when running program java bubblesortannomaly 50000 10 10, expect sortb slower sorta. got following (or similar) output on 3 different machines:

sorting sorta: 4.214 seconds. used  564960211 swaps. sorting sortb: 2.278 seconds. used 1249750569 swaps. sorting sorta: 4.199 seconds. used  563355818 swaps. sorting sortb: 2.254 seconds. used 1249750348 swaps. sorting sorta: 4.189 seconds. used  560825110 swaps. sorting sortb: 2.264 seconds. used 1249749572 swaps. sorting sorta: 4.17  seconds. used  561924561 swaps. sorting sortb: 2.256 seconds. used 1249749766 swaps. sorting sorta: 4.198 seconds. used  562613693 swaps. sorting sortb: 2.266 seconds. used 1249749880 swaps. sorting sorta: 4.19  seconds. used  561658723 swaps. sorting sortb: 2.281 seconds. used 1249751070 swaps. sorting sorta: 4.193 seconds. used  564986461 swaps. sorting sortb: 2.266 seconds. used 1249749681 swaps. sorting sorta: 4.203 seconds. used  562526980 swaps. sorting sortb: 2.27  seconds. used 1249749609 swaps. sorting sorta: 4.176 seconds. used  561070571 swaps. sorting sortb: 2.241 seconds. used 1249749831 swaps. sorting sorta: 4.191 seconds. used  559883210 swaps. sorting sortb: 2.257 seconds. used 1249749371 swaps. 

when set parameter limit to, e.g., 50000 (java bubblesortannomaly 50000 50000 10), expected results:

sorting sorta: 3982697438 ns. used 625941897 swaps. sorting sortb: 4657909823 ns. used 789391382 swaps. 

i ported program c++ determine whether problem java-specific. here c++ code.

#include <cstdlib> #include <iostream>  #include <omp.h>  #ifndef array_size #define array_size 50000 #endif  #ifndef limit #define limit 10 #endif  #ifndef runs #define runs 10 #endif  void swap(int * a, int i, int j) {     int h = a[i];     a[i] = a[j];     a[j] = h; }  int bubblesorta(int * a) {     const int last = array_size - 1;     int counter = 0;     (int = last; > 0; --i)     {         (int j = 0; j < i; ++j)         {             int next = j + 1;             if (a[j] > a[next])             {                 swap(a, j, next);                 ++counter;             }         }     }     return (counter); }  int bubblesortb(int * a) {     const int last = array_size - 1;     int counter = 0;     (int = last; > 0; --i)     {         (int j = 0; j < i; ++j)         {             int next = j + 1;             if (a[j] >= a[next])             {                 swap(a, j, next);                 ++counter;             }         }     }     return (counter); }  int main() {     int * = (int *) malloc(array_size * sizeof(int));     int * b = (int *) malloc(array_size * sizeof(int));     (int run = 0; runs > run; ++run)     {         (int idx = 0; idx < array_size; ++idx)         {             a[idx] = std::rand() % limit;             b[idx] = a[idx];         }          std::cout << "sorting sorta: ";         double start = omp_get_wtime();         int swaps = bubblesorta(a);          std::cout << (omp_get_wtime() - start) << " seconds. used " << swaps                   << " swaps." << std::endl;          std::cout << "sorting sortb: ";         start = omp_get_wtime();         swaps = bubblesortb(b);          std::cout << (omp_get_wtime() - start) << " seconds. used " << swaps                   << " swaps." << std::endl;     }      free(a);     free(b);      return (0); } 

this program shows same behaviour. can explain, going on here?

executing sortb first , sorta not change results.

i think may indeed due branch prediction. if count number of swaps compared number of inner sort iterations find:

limit = 10

  • a = 560m swaps / 1250m loops
  • b = 1250m swaps / 1250m loops (0.02% less swaps loops)

limit = 50000

  • a = 627m swaps / 1250m loops
  • b = 850m swaps / 1250m loops

so in limit == 10 case swap performed 99.98% of time in b sort favourable branch predictor. in limit == 50000 case swap hit randomly 68% branch predictor less beneficial.


Comments

Popular posts from this blog

apache - PHP Soap issue while content length is larger -

asynchronous - Python asyncio task got bad yield -

javascript - Complete OpenIDConnect auth when requesting via Ajax -