A DESCRIPTION OF THE REQUEST :
Because an unsigned byte load is not as fast as it could be, the decoders in the sun.nio.cs package use the +128 trick (see the source code section).
See discussion on hotspot-compiler-dev mailing list.
I'm working on speeding up charset de-/encoders:
https://java-nio-charset-enhanced.dev.java.net/
To make a final decision on the fastest solution, this RFE needs to be resolved.
JUSTIFICATION :
From the CPU's point of view, loading an unsigned byte should be as fast as, or faster than, a signed load followed by adding 128.
---------- BEGIN SOURCE ----------
import org.junit.*;
/**
*
* @author Ulf Zibis <Ulf.Zibis at CoSoCo.de>
*/
public class DecoderBenchmark {

    /** 256-entry lookup table shared by all decoder variants; populated by {@link #foo()}. */
    static final char[] map = new char[256];

    // Buffer configuration. Each variant keeps (buffer length * LOOPS) at 16,384,000 elements:
    //   131072 elements, LOOPS =  125 : exceeds the CPU L1 cache, the real-world constraint (active)
    //    16384 elements, LOOPS = 1000 : does not exceed the CPU L1 cache
    //     2048 elements, LOOPS = 8000 : far below the CPU L1 cache size
    /** Input bytes that every decoder variant reads. */
    static final byte[] src = new byte[131072];
    /** Output chars that every decoder variant writes. */
    static final char[] dst = new char[131072];
    /** Number of back-to-back passes over {@link #src} per timed section. */
    static final int LOOPS = 125;

    /** Number of measured rounds; each round runs all four variants once. */
    static final int OUTER_LOOPS = 100;
    /** Number of warm-up phases executed before measuring. */
    static final int WARMUP_LOOPS = 4;
    /** Size of one warm-up phase, as a fraction of {@link #OUTER_LOOPS}. */
    static final float WARMUP_RATIO = 0.25f;

    /**
     * Runs the benchmark: fills the tables, performs the warm-up phases, then times the four
     * decoder variants and prints the accumulated times in milliseconds to standard output.
     */
    @Test
    public void foo() {
        // Fill the tables with varied values; the intent is to force real memory loads and to
        // keep HotSpot from reducing the work to a plain increment.
        for (int k = 0; k < map.length; k++) {
            map[k] = (char) (59 * (227 - k));
        }
        for (int k = 0; k < src.length; k++) {
            src[k] = (byte) (13 * (17 - k));
        }

        final long nanosPerMilli = 1000000L;

        // Warm-up: run every variant repeatedly and report how long each phase took.
        long time = System.nanoTime();
        long lastWarmUpTime = 0;
        int warmUpPhase = 0;
        while (warmUpPhase < WARMUP_LOOPS) {
            for (int pass = 0; pass < WARMUP_RATIO * OUTER_LOOPS; pass++) {
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar1(src, dst);
                }
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar2(src, dst);
                }
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar3(src, dst);
                }
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar4(src, dst);
                }
            }
            lastWarmUpTime = System.nanoTime() - time;
            warmUpPhase++;
            System.out.println("time for warm up " + warmUpPhase + ": "
                    + lastWarmUpTime / nanosPerMilli + " ms");
            // Restart the clock after printing so the print itself is not counted.
            time = System.nanoTime();
        }

        long time1 = 0;
        long time2 = 0;
        long time3 = 0;
        long time4 = 0;
        long now;

        // The variants are interleaved within every round (rather than run one after another
        // in bulk) to even out the influence of other processes and of CPU clock-down caused
        // by overheating.
        for (int round = 0; round < OUTER_LOOPS; round++) {
            for (int rep = 0; rep < LOOPS; rep++) {
                bar1(src, dst);
            }
            now = System.nanoTime();
            time1 += now - time;
            time = now;

            for (int rep = 0; rep < LOOPS; rep++) {
                bar2(src, dst);
            }
            now = System.nanoTime();
            time2 += now - time;
            time = now;

            for (int rep = 0; rep < LOOPS; rep++) {
                bar3(src, dst);
            }
            now = System.nanoTime();
            time3 += now - time;
            time = now;

            for (int rep = 0; rep < LOOPS; rep++) {
                bar4(src, dst);
            }
            now = System.nanoTime();
            time4 += now - time;
            time = now;
        }

        System.out.println("time for map[a & 0xFF]: " + time1 / nanosPerMilli + " ms");
        System.out.println("time for map[a + 0x80]: " + time2 / nanosPerMilli + " ms");
        System.out.println("time for inlined map[a & 0xFF]: " + time3 / nanosPerMilli + " ms");
        System.out.println("time for inlined map[a + 0x80]: " + time4 / nanosPerMilli + " ms");

        // Ratio of the last warm-up phase to the measured total, scaled by WARMUP_RATIO.
        final long measuredTotal = time1 + time2 + time3 + time4;
        System.out.println("last warm up ./. test loops: "
                + (float) lastWarmUpTime / measuredTotal / WARMUP_RATIO);
    }

    /** Decodes all of {@code src} into {@code dst} by calling {@link #decode1(byte)} per byte. */
    static void bar1(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = decode1(src[pos]);
        }
    }

    /** Decodes all of {@code src} into {@code dst} by calling {@link #decode2(byte)} per byte. */
    static void bar2(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = decode2(src[pos]);
        }
    }

    /** Same lookup as {@link #bar1}, with the {@code & 0xFF} indexing written inline. */
    static void bar3(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = map[src[pos] & 0xFF];
        }
    }

    /** Same lookup as {@link #bar2}, with the {@code + 0x80} indexing written inline. */
    static void bar4(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = map[src[pos] + 0x80];
        }
    }

    /**
     * Looks up {@code a} using its unsigned value (0..255) as the table index.
     *
     * @param a the byte to decode
     * @return the entry of {@link #map} at index {@code a & 0xFF}
     */
    public static char decode1(byte a) {
        final int index = a & 0xFF;
        return map[index];
    }

    /**
     * Looks up {@code a} using its signed value shifted by 128, so -128..127 maps to 0..255.
     * Note that this addresses a different table slot than {@link #decode1(byte)} for the
     * same byte.
     *
     * @param a the byte to decode
     * @return the entry of {@link #map} at index {@code a + 0x80}
     */
    public static char decode2(byte a) {
        final int index = a + 0x80;
        return map[index];
    }

    /** Command-line entry point; runs the benchmark once. Arguments are ignored. */
    public static void main(String[] args) {
        new DecoderBenchmark().foo();
    }
}
---------- END SOURCE ----------
Because an unsigned byte load is not as fast as it could be, the decoders in the sun.nio.cs package use the +128 trick (see the source code section).
See discussion on hotspot-compiler-dev mailing list.
I'm working on speeding up charset de-/encoders:
https://java-nio-charset-enhanced.dev.java.net/
To make a final decision on the fastest solution, this RFE needs to be resolved.
JUSTIFICATION :
From the CPU's point of view, loading an unsigned byte should be as fast as, or faster than, a signed load followed by adding 128.
---------- BEGIN SOURCE ----------
import org.junit.*;
/**
*
* @author Ulf Zibis <Ulf.Zibis at CoSoCo.de>
*/
public class DecoderBenchmark {

    /** 256-entry lookup table shared by all decoder variants; populated by {@link #foo()}. */
    static final char[] map = new char[256];

    // Buffer configuration. Each variant keeps (buffer length * LOOPS) at 16,384,000 elements:
    //   131072 elements, LOOPS =  125 : exceeds the CPU L1 cache, the real-world constraint (active)
    //    16384 elements, LOOPS = 1000 : does not exceed the CPU L1 cache
    //     2048 elements, LOOPS = 8000 : far below the CPU L1 cache size
    /** Input bytes that every decoder variant reads. */
    static final byte[] src = new byte[131072];
    /** Output chars that every decoder variant writes. */
    static final char[] dst = new char[131072];
    /** Number of back-to-back passes over {@link #src} per timed section. */
    static final int LOOPS = 125;

    /** Number of measured rounds; each round runs all four variants once. */
    static final int OUTER_LOOPS = 100;
    /** Number of warm-up phases executed before measuring. */
    static final int WARMUP_LOOPS = 4;
    /** Size of one warm-up phase, as a fraction of {@link #OUTER_LOOPS}. */
    static final float WARMUP_RATIO = 0.25f;

    /**
     * Runs the benchmark: fills the tables, performs the warm-up phases, then times the four
     * decoder variants and prints the accumulated times in milliseconds to standard output.
     */
    @Test
    public void foo() {
        // Fill the tables with varied values; the intent is to force real memory loads and to
        // keep HotSpot from reducing the work to a plain increment.
        for (int k = 0; k < map.length; k++) {
            map[k] = (char) (59 * (227 - k));
        }
        for (int k = 0; k < src.length; k++) {
            src[k] = (byte) (13 * (17 - k));
        }

        final long nanosPerMilli = 1000000L;

        // Warm-up: run every variant repeatedly and report how long each phase took.
        long time = System.nanoTime();
        long lastWarmUpTime = 0;
        int warmUpPhase = 0;
        while (warmUpPhase < WARMUP_LOOPS) {
            for (int pass = 0; pass < WARMUP_RATIO * OUTER_LOOPS; pass++) {
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar1(src, dst);
                }
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar2(src, dst);
                }
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar3(src, dst);
                }
                for (int rep = 0; rep < LOOPS; rep++) {
                    bar4(src, dst);
                }
            }
            lastWarmUpTime = System.nanoTime() - time;
            warmUpPhase++;
            System.out.println("time for warm up " + warmUpPhase + ": "
                    + lastWarmUpTime / nanosPerMilli + " ms");
            // Restart the clock after printing so the print itself is not counted.
            time = System.nanoTime();
        }

        long time1 = 0;
        long time2 = 0;
        long time3 = 0;
        long time4 = 0;
        long now;

        // The variants are interleaved within every round (rather than run one after another
        // in bulk) to even out the influence of other processes and of CPU clock-down caused
        // by overheating.
        for (int round = 0; round < OUTER_LOOPS; round++) {
            for (int rep = 0; rep < LOOPS; rep++) {
                bar1(src, dst);
            }
            now = System.nanoTime();
            time1 += now - time;
            time = now;

            for (int rep = 0; rep < LOOPS; rep++) {
                bar2(src, dst);
            }
            now = System.nanoTime();
            time2 += now - time;
            time = now;

            for (int rep = 0; rep < LOOPS; rep++) {
                bar3(src, dst);
            }
            now = System.nanoTime();
            time3 += now - time;
            time = now;

            for (int rep = 0; rep < LOOPS; rep++) {
                bar4(src, dst);
            }
            now = System.nanoTime();
            time4 += now - time;
            time = now;
        }

        System.out.println("time for map[a & 0xFF]: " + time1 / nanosPerMilli + " ms");
        System.out.println("time for map[a + 0x80]: " + time2 / nanosPerMilli + " ms");
        System.out.println("time for inlined map[a & 0xFF]: " + time3 / nanosPerMilli + " ms");
        System.out.println("time for inlined map[a + 0x80]: " + time4 / nanosPerMilli + " ms");

        // Ratio of the last warm-up phase to the measured total, scaled by WARMUP_RATIO.
        final long measuredTotal = time1 + time2 + time3 + time4;
        System.out.println("last warm up ./. test loops: "
                + (float) lastWarmUpTime / measuredTotal / WARMUP_RATIO);
    }

    /** Decodes all of {@code src} into {@code dst} by calling {@link #decode1(byte)} per byte. */
    static void bar1(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = decode1(src[pos]);
        }
    }

    /** Decodes all of {@code src} into {@code dst} by calling {@link #decode2(byte)} per byte. */
    static void bar2(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = decode2(src[pos]);
        }
    }

    /** Same lookup as {@link #bar1}, with the {@code & 0xFF} indexing written inline. */
    static void bar3(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = map[src[pos] & 0xFF];
        }
    }

    /** Same lookup as {@link #bar2}, with the {@code + 0x80} indexing written inline. */
    static void bar4(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = map[src[pos] + 0x80];
        }
    }

    /**
     * Looks up {@code a} using its unsigned value (0..255) as the table index.
     *
     * @param a the byte to decode
     * @return the entry of {@link #map} at index {@code a & 0xFF}
     */
    public static char decode1(byte a) {
        final int index = a & 0xFF;
        return map[index];
    }

    /**
     * Looks up {@code a} using its signed value shifted by 128, so -128..127 maps to 0..255.
     * Note that this addresses a different table slot than {@link #decode1(byte)} for the
     * same byte.
     *
     * @param a the byte to decode
     * @return the entry of {@link #map} at index {@code a + 0x80}
     */
    public static char decode2(byte a) {
        final int index = a + 0x80;
        return map[index];
    }

    /** Command-line entry point; runs the benchmark once. Arguments are ignored. */
    public static void main(String[] args) {
        new DecoderBenchmark().foo();
    }
}
---------- END SOURCE ----------
- duplicates
-
JDK-6797305 Add LoadUB and LoadUI opcode class
-
- Resolved
-