/**
 * Exploratory tests around character encodings and
 * {@code XMLVorgangsParser.preprocessAndSanitizeLine}.
 *
 * <p>The encoding tests write raw UTF-8 bytes to a scratch file, read the lines back
 * with a given charset and compare each line with what the parser returns for it.
 * The remaining tests only print diagnostic information (primitive sizes, bit shifts).
 */
public class CharEncodingTest
{
    /** Prefix of the scratch file created in the default temp directory. */
    private static final String SCRATCH_FILE_PREFIX = "multibytesFromFilterRegion";

    /** Suffix of the scratch file. */
    private static final String SCRATCH_FILE_SUFFIX = ".txt";

    /**
     * Writes U+201D as UTF-8, reads it back once as UTF-8 and once as windows-1252,
     * and checks that the parser leaves both readings unchanged while only the UTF-8
     * reading reproduces the original character.
     *
     * @throws Exception if the scratch file cannot be written or read
     */
    @Test
    public void testUTF8CharReadAsCP1252Char() throws Exception
    {
        XMLVorgangsParser parser = new XMLVorgangsParser();
        // RIGHT DOUBLE QUOTATION MARK -> UTF-8 hex bytes E2 80 9D.
        // Written as an escape so the test does not depend on the compiler's source encoding.
        String charUnderTest = "\u201D";
        // getBytes returns exactly the encoded bytes; Charset.encode(..).array() exposes the
        // whole backing array, which may be longer than the encoded content.
        byte[] charUnderTestInUTF8 = charUnderTest.getBytes(StandardCharsets.UTF_8);
        assertEquals("e2809d", bytesAsHex(charUnderTestInUTF8));

        // A fresh temp file keeps the test platform independent and free of stale content.
        File multibytesFromFilterRegion = File.createTempFile(SCRATCH_FILE_PREFIX, SCRATCH_FILE_SUFFIX);
        boolean deleteSuccess;
        try
        {
            byte[] newlineBytes = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
            FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, charUnderTestInUTF8, true);
            FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, newlineBytes, true);

            String input = readLine(multibytesFromFilterRegion, "UTF-8");
            assertEquals(charUnderTest, input);
            assertEquals(input, parser.preprocessAndSanitizeLine(input));

            // Explicit charset instead of the platform default: the default is UTF-8 on many
            // systems, which would make the final assertion fail there.
            input = readLine(multibytesFromFilterRegion, "windows-1252");
            System.out.println("parser.preprocessAndSanitizeLine(input): " + parser.preprocessAndSanitizeLine(input));
            assertEquals(input, parser.preprocessAndSanitizeLine(input));
            assertFalse(input.equals(charUnderTest));
        }
        finally
        {
            deleteSuccess = multibytesFromFilterRegion.delete();
        }
        assertTrue(deleteSuccess);
    }

    /**
     * Reads the first line of a file using the given charset.
     *
     * @param multibytesFromFilterRegion file to read
     * @param charSet name of the charset used for decoding
     * @return the first line without its line terminator, or {@code null} if the file is empty
     * @throws IOException if the file cannot be opened or read, or the charset name is not supported
     */
    private String readLine(final File multibytesFromFilterRegion, final String charSet) throws IOException
    {
        // The stream is a separate resource so it is closed even if the reader cannot be created.
        try (FileInputStream fis = new FileInputStream(multibytesFromFilterRegion);
             BufferedReader br = new BufferedReader(new InputStreamReader(fis, charSet)))
        {
            return br.readLine();
        }
    }

    /**
     * Renders a byte array as lowercase hex, two digits per byte and no separators.
     *
     * @param ba bytes to render
     * @return the hex string, empty for an empty array
     */
    private String bytesAsHex(final byte[] ba)
    {
        StringBuilder hexStringBuilder = new StringBuilder(ba.length * 2);
        for (byte b : ba)
        {
            // Mask to 0..255 so bytes >= 0x80 are rendered as unsigned values.
            hexStringBuilder.append(String.format("%02x", b & 0xFF));
        }
        return hexStringBuilder.toString();
    }

    /**
     * Prints the size in bytes of the numeric primitive types and {@code char}.
     *
     * @throws Exception never thrown by the current body; kept for signature compatibility
     */
    @Test
    public void testSizeOfPrimitiveTypes() throws Exception
    {
        System.out.println("Size of byte: " + (Byte.SIZE / 8) + " bytes.");
        System.out.println("Size of short: " + (Short.SIZE / 8) + " bytes.");
        System.out.println("Size of int: " + (Integer.SIZE / 8) + " bytes.");
        System.out.println("Size of long: " + (Long.SIZE / 8) + " bytes.");
        System.out.println("Size of char: " + (Character.SIZE / 8) + " bytes.");
        System.out.println("Size of float: " + (Float.SIZE / 8) + " bytes.");
        System.out.println("Size of double: " + (Double.SIZE / 8) + " bytes.");
    }

    /**
     * Encodes the first {@code charsetSize} chars as UTF-8, selects the multibyte encodings
     * whose first two bytes, read as one 16-bit value, lie strictly between 0xD7FF and 0xE000,
     * writes them to a scratch file in overlapping windows of 12, and reports every line the
     * parser does not return unchanged.
     *
     * @throws Exception if the scratch file cannot be written or read
     */
    @Test
    public void testMultibyteCharsReaderAndWriterHasCorrectEncoding() throws Exception
    {
        // Not closed on purpose: closing this wrapper would close System.out as well.
        PrintStream utf8out = new PrintStream(System.out, true, "UTF-8");
        String lineSeparator = System.lineSeparator();

        List<String> characters = new ArrayList<>();
        List<byte[]> charactersUtf8Bytes = new ArrayList<>();
        char currentChar = 0;
        // NOTE(review): with 536 chars the highest UTF-8 lead byte is 0xC8, so no encoding
        // reaches the 0xD800..0xDFFF range below; presumably a larger value was intended - confirm.
        int charsetSize = 536;
        for (int i = 0; i < charsetSize; i++)
        {
            String currentCharAsString = String.valueOf(currentChar);
            characters.add(currentCharAsString);
            // Exact-length encoding; the backing array of Charset.encode(..) may carry padding.
            charactersUtf8Bytes.add(currentCharAsString.getBytes(StandardCharsets.UTF_8));
            utf8out.print(currentChar);
            if (i % 80 == 0)
            {
                utf8out.print(lineSeparator);
            }
            currentChar++;
        }
        utf8out.print(lineSeparator);
        System.out.println(characters.size() + " chars created");

        char greaterThanValue = 0xD7FF;
        char lessThanValue = 0xE000;
        System.out.println("char filter ranges from " + (greaterThanValue + 1) + " to " + (lessThanValue - 1));

        List<byte[]> multibyteCharactersStartingWithCharFromFilteredRegion = new ArrayList<>();
        for (byte[] ca : charactersUtf8Bytes)
        {
            if (ca.length < 2)
            {
                // Single-byte encodings have no second byte to combine; skip them instead of
                // re-testing a value left over from a previous iteration.
                continue;
            }
            char leadingBytes = concatBytesToChar(ca);
            if (leadingBytes > greaterThanValue && leadingBytes < lessThanValue)
            {
                multibyteCharactersStartingWithCharFromFilteredRegion.add(ca);
            }
        }

        int candidateCount = multibyteCharactersStartingWithCharFromFilteredRegion.size();
        System.out.println(candidateCount + " multibyte chars in filtered region:");
        for (int i = 0; i < candidateCount; i++)
        {
            byte[] ba = multibyteCharactersStartingWithCharFromFilteredRegion.get(i);
            printFirstCharAsHex(ba);
            utf8out.print(StandardCharsets.UTF_8.decode(ByteBuffer.wrap(ba)) + ",");
            if (i % 6 == 0)
            {
                System.out.print(lineSeparator);
            }
        }
        System.out.print(lineSeparator);

        // The temp file always exists, so reading and deleting work even when nothing is written.
        File multibytesFromFilterRegion = File.createTempFile(SCRATCH_FILE_PREFIX, SCRATCH_FILE_SUFFIX);
        boolean success;
        try
        {
            byte[] newlineBytes = lineSeparator.getBytes(StandardCharsets.UTF_8);
            int windowSize = 12;
            // One line per sliding window; "<=" so the last full window (and with it the last
            // candidate) is written too.
            for (int start = 0; start + windowSize <= candidateCount; start++)
            {
                for (int offset = 0; offset < windowSize; offset++)
                {
                    FileUtils.writeByteArrayToFile(multibytesFromFilterRegion,
                            multibyteCharactersStartingWithCharFromFilteredRegion.get(start + offset), true);
                }
                FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, newlineBytes, true);
            }

            XMLVorgangsParser parser = new XMLVorgangsParser();
            try (FileInputStream fis = new FileInputStream(multibytesFromFilterRegion);
                 BufferedReader br = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)))
            {
                String input = br.readLine();
                while (input != null)
                {
                    try
                    {
                        assertEquals(input, parser.preprocessAndSanitizeLine(input));
                    }
                    catch (AssertionError e)
                    {
                        // Best-effort reporting: a mismatch is printed and the scan continues,
                        // so a mismatch alone does not fail this test.
                        utf8out.println("failed on input " + input + " output " + parser.preprocessAndSanitizeLine(input));
                        utf8out.println("unknown bytes ignored: " + decodeText(input, StandardCharsets.UTF_8, CodingErrorAction.IGNORE));
                    }
                    input = br.readLine();
                }
            }
        }
        finally
        {
            success = multibytesFromFilterRegion.delete();
        }
        assertTrue(success);
    }

    /**
     * Prints the first two bytes of the array as one four-digit hex value, wrapped in
     * {@code [B@} and {@code ]}.
     *
     * @param ba byte array with at least two elements
     */
    private void printFirstCharAsHex(final byte[] ba)
    {
        int firstTwoBytes = concatBytesToChar(ba);
        System.out.print("[B@");
        System.out.print(String.format("%04x", firstTwoBytes));
        System.out.print("]");
    }

    /**
     * Combines the first two bytes of the array into one 16-bit value, first byte in the
     * high half.
     *
     * @param ca byte array with at least two elements
     * @return {@code (ca[0] << 8) | ca[1]} with both bytes treated as unsigned
     * @throws ArrayIndexOutOfBoundsException if the array has fewer than two elements
     */
    private char concatBytesToChar(final byte[] ca)
    {
        // Mask both bytes: Java bytes are signed, so values >= 0x80 would otherwise be
        // sign-extended and corrupt the combined value.
        return (char) (((ca[0] & 0xFF) << 8) | (ca[1] & 0xFF));
    }

    /**
     * Prints the char 0xD8 shifted left by 0 to 8 bits as hex, as a char and as a decimal number.
     *
     * @throws Exception never thrown by the current body; kept for signature compatibility
     */
    @Test
    public void testByteShiftBitwise() throws Exception
    {
        char value = 0xd8;
        for (int i = 0; i < 9; i++)
        {
            System.out.print(value + " shifted " + i + " times: ");
            char target = (char) (value << i);
            int number = target;
            String hex = String.format("%04x", number);
            System.out.println("\t" + hex + "\t" + target + "\t" + number);
        }
    }

    /**
     * Encodes the input with the given charset and decodes its first line again, applying
     * the given action to malformed byte sequences.
     *
     * @param input text to round-trip
     * @param charset charset used for both encoding and decoding
     * @param codingErrorAction action the decoder takes on malformed input
     * @return the first decoded line, or {@code null} if the input is empty
     * @throws IOException if reading from the in-memory stream fails
     */
    String decodeText(final String input, final Charset charset, final CodingErrorAction codingErrorAction) throws IOException
    {
        CharsetDecoder charsetDecoder = charset.newDecoder();
        charsetDecoder.onMalformedInput(codingErrorAction);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new ByteArrayInputStream(input.getBytes(charset)), charsetDecoder)))
        {
            return reader.readLine();
        }
    }
}