Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-6799997

Enhance HotSpot for unsigned byte load

    • Icon: Enhancement Enhancement
    • Resolution: Duplicate
    • Icon: P5 P5
    • None
    • 7
    • hotspot
    • x86
    • windows_xp

      A DESCRIPTION OF THE REQUEST :
      Because an unsigned byte load is not as fast as it could be, decoders in the sun.nio.cs package use the +128 trick (see the source code section).

      See discussion on hotspot-compiler-dev mailing list.

      I'm working on speeding up charset de-/encoders:
      https://java-nio-charset-enhanced.dev.java.net/

        To make a final decision on the fastest solution, this RFE needs to be resolved first.



      JUSTIFICATION :
        From the CPU's point of view, loading an unsigned byte should be as fast as, or faster than, a signed load followed by adding 128.



      ---------- BEGIN SOURCE ----------
      import org.junit.*;

      /**
       * Micro-benchmark comparing two ways of turning a (signed) {@code byte}
       * into an index of a 256-entry lookup table:
       * masking ({@code a & 0xFF}) versus biasing ({@code a + 0x80}).
       * Each variant is timed twice: once through a small helper method
       * ({@link #decode1}/{@link #decode2}, used by {@code bar1}/{@code bar2}) and
       * once with the expression written directly in the loop
       * ({@code bar3}/{@code bar4}).
       *
       * <p>Note that the two index schemes pick different table entries for the
       * same byte value; only the elapsed times are compared, not the decoded output.
       *
       * @author Ulf Zibis <Ulf.Zibis at CoSoCo.de>
       */
      public class DecoderBenchmark {

          // Lookup table indexed by the converted byte value (0..255).
          static final char[] map = new char[256];
          static final byte[] src = new byte[131072]; // exceed CPU L1-cache for real world constraint
          static final char[] dst = new char[131072]; // exceed CPU L1-cache for real world constraint
          // Number of consecutive calls of one decoder variant per timing slice.
          static final int LOOPS = 125;
          // Alternative working-set sizes, kept for reference. Each set keeps
          // src.length * LOOPS at the same total (16,384,000 bytes per slice).
      // static final byte[] src = new byte[16384]; // don't exceed CPU L1-cache
      // static final char[] dst = new char[16384]; // don't exceed CPU L1-cache
      // static final int LOOPS = 1000;
      // static final byte[] src = new byte[2048]; // far below of exceeding CPU L1-cache
      // static final char[] dst = new char[2048]; // far below of exceeding CPU L1-cache
      // static final int LOOPS = 8000;
          // Number of measured rounds; each round runs all four variants once.
          static final int OUTER_LOOPS = 100;
          // Number of warm-up passes executed (and reported) before measuring.
          static final int WARMUP_LOOPS = 4;
          // Each warm-up pass runs WARMUP_RATIO * OUTER_LOOPS rounds.
          static final float WARMUP_RATIO = 0.25f;

          /**
           * Runs the benchmark: fills the input arrays, executes the warm-up
           * passes, then measures the four decoder variants and prints the
           * accumulated time of each variant in milliseconds.
           */
          @Test
          public void foo() {
              // fill arrays, to force real memory load and prohibit HotSpot from just incrementing
              // (maybe candidate for sophisticated HotSpot optimization ;-) )
              for (int i=0; i<map.length; i++)
                  map[i] = (char)(59 * (227 - i));
              for (int i=0; i<src.length; i++)
                  src[i] = (byte)(13 * (17 - i));
              // warm up:
              long time = System.nanoTime();
              long lastWarmUpTime = 0;
              // h is advanced by the ++h inside the println below, not in the loop header
              for (int h=0; h<WARMUP_LOOPS; ) {
                  for (int i=0; i<WARMUP_RATIO*OUTER_LOOPS; i++) {
                      for (int j=0; j<LOOPS; j++)
                          bar1(src, dst);
                      for (int j=0; j<LOOPS; j++)
                          bar2(src, dst);
                      for (int j=0; j<LOOPS; j++)
                          bar3(src, dst);
                      for (int j=0; j<LOOPS; j++)
                          bar4(src, dst);
                  }
                  lastWarmUpTime = System.nanoTime()-time;
                  System.out.println("time for warm up "+(++h)+": "+(lastWarmUpTime)/1000000+" ms");
                  time = System.nanoTime();// don't count time for print ;-)
              }
              // Accumulated nanoseconds per variant (bar1..bar4).
              long time1 = 0;
              long time2 = 0;
              long time3 = 0;
              long time4 = 0;
              // swap decoders to eliminate influence of
              // other processes and CPU clockdown, caused by overheating
              for (int i=0; i<OUTER_LOOPS; i++) {
                  for (int j=0; j<LOOPS; j++)
                      bar1(src, dst);
                  // Operands are evaluated left to right: the old value of 'time' is read
                  // first, then 'time' is reset to now. Net effect: timeN += now - previous.
                  // The very first slice starts at the timestamp taken after the last warm-up print.
                  time1 -= time - (time = System.nanoTime());
                  for (int j=0; j<LOOPS; j++)
                      bar2(src, dst);
                  time2 -= time - (time = System.nanoTime());
                  for (int j=0; j<LOOPS; j++)
                      bar3(src, dst);
                  time3 -= time - (time = System.nanoTime());
                  for (int j=0; j<LOOPS; j++)
                      bar4(src, dst);
                  time4 -= time - (time = System.nanoTime());
              }
              System.out.println("time for map[a & 0xFF]: "+time1/1000000+" ms");
              System.out.println("time for map[a + 0x80]: "+time2/1000000+" ms");
              System.out.println("time for inlined map[a & 0xFF]: "+time3/1000000+" ms");
              System.out.println("time for inlined map[a + 0x80]: "+time4/1000000+" ms");
              // Per-round speed of the last warm-up pass relative to the measured rounds:
              // a value close to 1 means both ran at the same per-round speed.
              System.out.println("last warm up ./. test loops: "
                      +(float)lastWarmUpTime/(time1+time2+time3+time4)/WARMUP_RATIO);
          }

          /** Decodes every byte of {@code src} into {@code dst} via {@link #decode1} (mask variant, helper call). */
          static void bar1(byte[] src, char[] dst) {
              for (int i=0; i<src.length; i++)
                  dst[i] = decode1(src[i]);
          }
          /** Decodes every byte of {@code src} into {@code dst} via {@link #decode2} (bias variant, helper call). */
          static void bar2(byte[] src, char[] dst) {
              for (int i=0; i<src.length; i++)
                  dst[i] = decode2(src[i]);
          }
          /** Same as {@code bar1}, but with the {@code & 0xFF} lookup written directly in the loop. */
          static void bar3(byte[] src, char[] dst) {
              for (int i=0; i<src.length; i++)
                  dst[i] = map[src[i] & 0xFF];
          }
          /** Same as {@code bar2}, but with the {@code + 0x80} lookup written directly in the loop. */
          static void bar4(byte[] src, char[] dst) {
              for (int i=0; i<src.length; i++)
                  dst[i] = map[src[i] + 0x80];
          }

          /**
           * Looks up {@code a} treated as an unsigned value: the mask yields an
           * index of 0..255, with negative bytes landing in 128..255.
           *
           * @param a the byte to decode
           * @return the table entry at index {@code a & 0xFF}
           */
          public static char decode1(byte a) {
              return map[a & 0xFF];
          }
          /**
           * Looks up {@code a} shifted by 128: -128..127 becomes index 0..255,
           * so the table is addressed in a different order than in {@link #decode1}.
           *
           * @param a the byte to decode
           * @return the table entry at index {@code a + 0x80}
           */
          public static char decode2(byte a) {
              return map[a + 0x80];
          }

          /** Command-line entry point; runs the benchmark once without a test runner. */
          public static void main(String[] args) {
              DecoderBenchmark dbm = new DecoderBenchmark();
              dbm.foo();
          }
      }

      ---------- END SOURCE ----------

            twisti Christian Thalinger (Inactive)
            ndcosta Nelson Dcosta (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            0 Start watching this issue

              Created:
              Updated:
              Resolved:
              Imported:
              Indexed: