diff options
Diffstat (limited to 'test')
-rw-r--r--[-rwxr-xr-x] | test/Makefile | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/chard7.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/chardos.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/charu7.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/dos_bin.txt | bin | 144 -> 144 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/gb18030.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/gb18030b.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/gb18030u.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/invalhig.txt | bin | 30 -> 30 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/invallow.txt | bin | 30 -> 30 bytes | |||
-rw-r--r-- | test/setlocale.c | 89 | ||||
-rw-r--r-- | test/setlocale.png | bin | 0 -> 18506 bytes | |||
-rwxr-xr-x | test/testcmd.bat | 2 | ||||
-rwxr-xr-x | test/testps16.ps1 | bin | 846 -> 840 bytes | |||
-rwxr-xr-x | test/testps8.ps1 | 7 | ||||
-rw-r--r--[-rwxr-xr-x] | test/testu16.c | 37 | ||||
-rw-r--r--[-rwxr-xr-x] | test/uni_nl_ëäï.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/unix_bin.txt | bin | 136 -> 136 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/utf16.txt | bin | 66 -> 66 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/utf16be.txt | bin | 1926 -> 1926 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/utf16bin.txt | bin | 290 -> 290 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/utf16le.txt | bin | 1926 -> 1926 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/utf16m.txt | bin | 54 -> 54 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/utf16u.txt | bin | 54 -> 54 bytes | |||
-rw-r--r--[-rwxr-xr-x] | test/utf8dos.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/utf8dosn.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/utf8unix.txt | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | test/utf8unxb.txt | 0 |
28 files changed, 105 insertions, 30 deletions
diff --git a/test/Makefile b/test/Makefile index e2a55e9..e2a55e9 100755..100644 --- a/test/Makefile +++ b/test/Makefile diff --git a/test/chard7.txt b/test/chard7.txt index 3b3c9fc..3b3c9fc 100755..100644 --- a/test/chard7.txt +++ b/test/chard7.txt diff --git a/test/chardos.txt b/test/chardos.txt index ff85358..ff85358 100755..100644 --- a/test/chardos.txt +++ b/test/chardos.txt diff --git a/test/charu7.txt b/test/charu7.txt index 0073580..0073580 100755..100644 --- a/test/charu7.txt +++ b/test/charu7.txt diff --git a/test/dos_bin.txt b/test/dos_bin.txt Binary files differindex 82c9864..82c9864 100755..100644 --- a/test/dos_bin.txt +++ b/test/dos_bin.txt diff --git a/test/gb18030.txt b/test/gb18030.txt index 9e294aa..9e294aa 100755..100644 --- a/test/gb18030.txt +++ b/test/gb18030.txt diff --git a/test/gb18030b.txt b/test/gb18030b.txt index 2b08b78..2b08b78 100755..100644 --- a/test/gb18030b.txt +++ b/test/gb18030b.txt diff --git a/test/gb18030u.txt b/test/gb18030u.txt index 7687092..7687092 100755..100644 --- a/test/gb18030u.txt +++ b/test/gb18030u.txt diff --git a/test/invalhig.txt b/test/invalhig.txt Binary files differindex da9ce0f..da9ce0f 100755..100644 --- a/test/invalhig.txt +++ b/test/invalhig.txt diff --git a/test/invallow.txt b/test/invallow.txt Binary files differindex c870871..c870871 100755..100644 --- a/test/invallow.txt +++ b/test/invallow.txt diff --git a/test/setlocale.c b/test/setlocale.c new file mode 100644 index 0000000..005f0fb --- /dev/null +++ b/test/setlocale.c @@ -0,0 +1,89 @@ +#include <stdio.h>
+#include <fcntl.h>
+#include <windows.h>
+#include <locale.h>
+
+/*
+
+ This program proves that when you set the locale to "", the Chinese ANSI CP936
+ encoded text is printed wrongly in a simplified Chinese regional setting.
+ UTF-8 is also printed wrongly.
+
+To test this program you first need to change the Windows locale setting to
+simplified Chinese. There is no problem doing that, because the "display
+language" will stay the original language, and you can still use your Windows.
+
+Control Panel > Region and Language > Administrative > Change system locale
+
+Select simplified Chinese and reboot your PC.
+
+For example output see setlocale.png.
+
+ */
+
+/*
+ * Print the sample text to stdout in the encodings selected by the local
+ * `utf8` switch.
+ *
+ * str:  text encoded in ANSI CP936, printed with printf().
+ * ustr: text encoded in UTF-8, printed while the console output code page
+ *       is temporarily set to CP_UTF8.
+ * wstr: text encoded in UTF-16, printed while stdout is temporarily in
+ *       _O_U16TEXT mode.
+ *
+ * The comments at the individual calls record which output was observed to
+ * be right or wrong for each locale setting.
+ */
+static void print_string(const char *str, const char *ustr, const wchar_t *wstr) {
+    int prevmode;
+    UINT outputCP;
+    int utf8 = 0;
+
+    /* Set utf8 to 1 to print UTF-8 text.
+       If you print both UTF-16 and UTF-8 in one program the console gets
+       mixed up. When the UTF-8 is printed after UTF-16 the UTF-16 text
+       is displayed wrongly.
+     */
+
+    if ( ! utf8 ) {
+        /* When the locale is set to "" the following line will produce wrong output */
+        printf ("ANSI CP936 %s\n",str);
+
+        /* Pending output must be flushed before the translation mode of
+           the stream is changed. */
+        fflush(stdout);
+
+        /* UTF-16 will produce correct output in all cases. */
+        prevmode = _setmode(_fileno(stdout), _O_U16TEXT);
+        wprintf(L"UTF-16 %ls\n",wstr);
+        fflush(stdout);
+        /* _setmode() returns -1 on failure; only restore a valid mode. */
+        if (prevmode != -1) {
+            _setmode(_fileno(stdout), prevmode);
+        }
+    } else {
+
+        /* UTF-8 will produce wrong output when the locale is "".
+           When the locale is "C" wrong output with raster font, and correct output
+           with TrueType font. */
+        fflush(stdout);
+        outputCP = GetConsoleOutputCP();
+        SetConsoleOutputCP(CP_UTF8);
+        wprintf(L"UTF-8 %S\n",ustr);
+        /* Make sure the text reaches the console while CP_UTF8 is still
+           the active output code page. */
+        fflush(stdout);
+        SetConsoleOutputCP(outputCP);
+
+        /* The code below produces wrong output in all cases */
+        // prevmode = _setmode(_fileno(stdout), _O_U8TEXT);
+        // wprintf(L"UTF-8 %S",ustr);
+        // _setmode(_fileno(stdout), prevmode);
+    }
+
+}
+
+/*
+ * Build one sample text in CP936, UTF-16 and UTF-8, then print it once with
+ * the locale set to "" and once with the locale set to "C".
+ *
+ * Returns 0 on success and 1 when one of the code page conversions fails.
+ */
+int main(void) {
+
+    /* ANSI CP936 string (meaning: Western-European). The casts keep the
+       byte values explicit when plain char is signed. */
+    const char str[5] = { (char)0xce, (char)0xf7, (char)0xc5, (char)0xb7, '\0' };
+    char ustr[15];     /* UTF-8 */
+    wchar_t wstr[10];  /* UTF-16 */
+
+    /* Convert CP936 to UTF-16. The output size is a count of wide
+       characters, not a count of bytes. */
+    if (MultiByteToWideChar(936, 0, str, -1, wstr,
+                            (int)(sizeof(wstr) / sizeof(wstr[0]))) == 0) {
+        fprintf(stderr, "MultiByteToWideChar failed: %lu\n",
+                (unsigned long)GetLastError());
+        return 1;
+    }
+    /* Convert UTF-16 to UTF-8. Here the output size is a count of bytes. */
+    if (WideCharToMultiByte(CP_UTF8, 0, wstr, -1, ustr, (int)sizeof(ustr),
+                            NULL, NULL) == 0) {
+        fprintf(stderr, "WideCharToMultiByte failed: %lu\n",
+                (unsigned long)GetLastError());
+        return 1;
+    }
+
+    setlocale (LC_ALL, "");
+    printf("==> setlocale (LC_ALL, \"\");\n");
+    print_string(str, ustr, wstr);
+
+    setlocale (LC_ALL, "C");
+    printf("\n==> setlocale (LC_ALL, \"C\");\n");
+    print_string(str, ustr, wstr);
+
+    return 0;
+}
diff --git a/test/setlocale.png b/test/setlocale.png Binary files differnew file mode 100644 index 0000000..9920216 --- /dev/null +++ b/test/setlocale.png diff --git a/test/testcmd.bat b/test/testcmd.bat index 9431352..8acea08 100755 --- a/test/testcmd.bat +++ b/test/testcmd.bat @@ -8,7 +8,7 @@ REM This script will only run on Windows 7 and higher. REM switch to UTF-8 code page
chcp 65001
-dos2unix -i uni_el_αρχείο.txt uni_zh_文件.txt
+dos2unix -D unicode -i uni_el_αρχείο.txt uni_zh_文件.txt
REM set code page back to original value
chcp 850
diff --git a/test/testps16.ps1 b/test/testps16.ps1 Binary files differindex 059ec03..c927442 100755 --- a/test/testps16.ps1 +++ b/test/testps16.ps1 diff --git a/test/testps8.ps1 b/test/testps8.ps1 index 7bf72db..dc2bcfe 100755 --- a/test/testps8.ps1 +++ b/test/testps8.ps1 @@ -8,5 +8,8 @@ dos2unix -i uni_el_αρχείο.txt uni_zh_文件.txt echo "test select-string:"
-# The following is not working. Why?
-dos2unix -i uni* | select-string -encoding utf8 -pattern αρχ
\ No newline at end of file +# select-string requires a BOM.
+$env:DOS2UNIX_DISPLAY_ENC = "utf8bom"
+dos2unix -i uni* | select-string -encoding utf8 -pattern αρχ
+
+$env:DOS2UNIX_DISPLAY_ENC = ""
diff --git a/test/testu16.c b/test/testu16.c index 8ed706a..d0a8f9e 100755..100644 --- a/test/testu16.c +++ b/test/testu16.c @@ -3,44 +3,27 @@ #include <fcntl.h>
#include <io.h>
-/* This program demonstrates that when Unicode printed text is redirected to
- a file, the file is not in correct UTF-16.
+/* This program demonstrates Unicode UTF-16 printed text redirected to a
+ correct UTF-16 file.
+
+ .\testu16.exe > out.txt
+
*/
int main () {
int prevmode;
-/*
- When the output of this program is redirected to a file in Windows Command
- Prompt you get for line breaks 0d0a 00, while it should be 0d00 0a00.
-
-c:\test>.\testu16.exe > o.txt
-
-c:\test>xxd o.txt
-0000000: 6f00 6e00 6500 0d0a 0074 0077 006f 000d o.n.e....t.w.o..
-0000010: 0a00 7400 6800 7200 6500 6500 0d0a 00 ..t.h.r.e.e....
-
-
-
- When the output is redirected to a file in PowerShell, null characters
- 0000 are inserted. It looks like UTF-32 with an UTF-16 BOM and UTF-16 line
- breaks.
-
-PS C:\test> .\testu16.exe > p.txt
-PS C:\test> xxd p.txt
-0000000: fffe 6f00 0000 6e00 0000 6500 0000 0d00 ..o...n...e.....
-0000010: 0a00 0000 7400 0000 7700 0000 6f00 0000 ....t...w...o...
-0000020: 0d00 0a00 0000 7400 0000 6800 0000 7200 ......t...h...r.
-0000030: 0000 6500 0000 6500 0000 0d00 0a00 0000 ..e...e.........
-0000040: 0d00 0a00 ....
-
-*/
prevmode = _setmode(_fileno(stdout), _O_U16TEXT);
+ /* We need to print a UTF-16 BOM for correct redirection in PowerShell. */
+ fwprintf(stdout, L"\xfeff");
fwprintf(stdout,L"one\n");
fwprintf(stdout,L"two\n");
fwprintf(stdout,L"three\n");
+ /* Flushing stdout is required to get correct UTF-16.
+ This is required for both CMD.exe and PowerShell. */
+ fflush(stdout);
_setmode(_fileno(stdout), prevmode);
diff --git a/test/uni_nl_ëäï.txt b/test/uni_nl_ëäï.txt index a9cd879..a9cd879 100755..100644 --- a/test/uni_nl_ëäï.txt +++ b/test/uni_nl_ëäï.txt diff --git a/test/unix_bin.txt b/test/unix_bin.txt Binary files differindex b2cafbc..b2cafbc 100755..100644 --- a/test/unix_bin.txt +++ b/test/unix_bin.txt diff --git a/test/utf16.txt b/test/utf16.txt Binary files differindex 868b5af..868b5af 100755..100644 --- a/test/utf16.txt +++ b/test/utf16.txt diff --git a/test/utf16be.txt b/test/utf16be.txt Binary files differindex 1be5db9..1be5db9 100755..100644 --- a/test/utf16be.txt +++ b/test/utf16be.txt diff --git a/test/utf16bin.txt b/test/utf16bin.txt Binary files differindex 6c4216b..6c4216b 100755..100644 --- a/test/utf16bin.txt +++ b/test/utf16bin.txt diff --git a/test/utf16le.txt b/test/utf16le.txt Binary files differindex db9f535..db9f535 100755..100644 --- a/test/utf16le.txt +++ b/test/utf16le.txt diff --git a/test/utf16m.txt b/test/utf16m.txt Binary files differindex cd523f8..cd523f8 100755..100644 --- a/test/utf16m.txt +++ b/test/utf16m.txt diff --git a/test/utf16u.txt b/test/utf16u.txt Binary files differindex 2bd9b3b..2bd9b3b 100755..100644 --- a/test/utf16u.txt +++ b/test/utf16u.txt diff --git a/test/utf8dos.txt b/test/utf8dos.txt index 76afa4f..76afa4f 100755..100644 --- a/test/utf8dos.txt +++ b/test/utf8dos.txt diff --git a/test/utf8dosn.txt b/test/utf8dosn.txt index 154aa88..154aa88 100755..100644 --- a/test/utf8dosn.txt +++ b/test/utf8dosn.txt diff --git a/test/utf8unix.txt b/test/utf8unix.txt index 8b93c0c..8b93c0c 100755..100644 --- a/test/utf8unix.txt +++ b/test/utf8unix.txt diff --git a/test/utf8unxb.txt b/test/utf8unxb.txt index f670e9b..f670e9b 100755..100644 --- a/test/utf8unxb.txt +++ b/test/utf8unxb.txt |