adb: win32: handle incomplete UTF-8 in console output, other fixes
Previously, the various adb_printf, adb_fwrite, etc. functions did not
correctly handle the case of the passed buffer ending with an incomplete
UTF-8 sequence. This is fixed by buffering up incomplete UTF-8 sequences
in g_console_output_buffer (protected by the mutex
g_console_output_buffer) and outputting it later once the full sequence
is available.
A unittest for the main worker function, ParseCompleteUTF8(), was added
to adb_test.
Other fixes:
- Fix the return value of number-of-chars written to be number of UTF-8
bytes instead of number of UTF-16 characters.
- Don't overwrite errno in success cases of various adb_printf, etc.
functions. This might be excessive, but might be useful in the case
when these functions are used for debugging/tracing.
- Add missing UTF-8 stdio functions that aren't currently used by adb,
but might be in the future: vprintf, putc, putchar, puts.
- stdin_raw_init: If we can't get the console handle, don't call
SetConsoleMode(). Not a big deal, but this will prevent erroneous
trace output.
Change-Id: I8730e8af92882c42b884ad921b39a17b54465085
Signed-off-by: Spencer Low <CompareAndSwap@gmail.com>
diff --git a/adb/sysdeps_win32.cpp b/adb/sysdeps_win32.cpp
index c3889b6..1f6ea3f 100644
--- a/adb/sysdeps_win32.cpp
+++ b/adb/sysdeps_win32.cpp
@@ -2483,12 +2483,15 @@
}
+static adb_mutex_t g_console_output_buffer_lock;
+
void
adb_sysdeps_init( void )
{
#define ADB_MUTEX(x) InitializeCriticalSection( & x );
#include "mutex_list.h"
InitializeCriticalSection( &_win32_lock );
+ InitializeCriticalSection( &g_console_output_buffer_lock );
}
/**************************************************************************/
@@ -2557,6 +2560,8 @@
// Returns a console handle if |stream| is a console, otherwise returns nullptr.
static HANDLE _get_console_handle(FILE* const stream) {
+ // Save and restore errno to make it easier for callers to prevent from overwriting errno.
+ android::base::ErrnoRestorer er;
const int fd = fileno(stream);
if (fd < 0) {
return nullptr;
@@ -3316,6 +3321,9 @@
void stdin_raw_init() {
const HANDLE in = _get_console_handle(STDIN_FILENO, &_old_console_mode);
+ if (in == nullptr) {
+ return;
+ }
// Disable ENABLE_PROCESSED_INPUT so that Ctrl-C is read instead of
// calling the process Ctrl-C routine (configured by
@@ -3633,16 +3641,98 @@
return _wchmod(path_wide.c_str(), mode);
}
-// Internal helper function to write UTF-8 bytes to a console. Returns -1
-// on error.
-static int _console_write_utf8(const char* buf, size_t size, FILE* stream,
- HANDLE console) {
- std::wstring output;
+// From libutils/Unicode.cpp, get the length of a UTF-8 sequence given the lead byte.
+static inline size_t utf8_codepoint_len(uint8_t ch) {
+ return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
+}
- // Try to convert from data that might be UTF-8 to UTF-16, ignoring errors.
- // Data might not be UTF-8 if the user cat's random data, runs dmesg, etc.
+namespace internal {
+
+// Given a sequence of UTF-8 bytes (denoted by the range [first, last)), return the number of bytes
+// (from the beginning) that are complete UTF-8 sequences and append the remaining bytes to
+// remaining_bytes.
+size_t ParseCompleteUTF8(const char* const first, const char* const last,
+ std::vector<char>* const remaining_bytes) {
+ // Walk backwards from the end of the sequence looking for the beginning of a UTF-8 sequence.
+ // Current_after points one byte past the current byte to be examined.
+ for (const char* current_after = last; current_after != first; --current_after) {
+ const char* const current = current_after - 1;
+ const char ch = *current;
+ const char kHighBit = 0x80u;
+ const char kTwoHighestBits = 0xC0u;
+ if ((ch & kHighBit) == 0) { // high bit not set
+ // The buffer ends with a one-byte UTF-8 sequence, possibly followed by invalid trailing
+ // bytes with no leading byte, so return the entire buffer.
+ break;
+ } else if ((ch & kTwoHighestBits) == kTwoHighestBits) { // top two highest bits set
+ // Lead byte in UTF-8 sequence, so check if we have all the bytes in the sequence.
+ const size_t bytes_available = last - current;
+ if (bytes_available < utf8_codepoint_len(ch)) {
+ // We don't have all the bytes in the UTF-8 sequence, so return all the bytes
+ // preceding the current incomplete UTF-8 sequence and append the remaining bytes
+ // to remaining_bytes.
+ remaining_bytes->insert(remaining_bytes->end(), current, last);
+ return current - first;
+ } else {
+ // The buffer ends with a complete UTF-8 sequence, possibly followed by invalid
+ // trailing bytes with no lead byte, so return the entire buffer.
+ break;
+ }
+ } else {
+ // Trailing byte, so keep going backwards looking for the lead byte.
+ }
+ }
+
+ // Return the size of the entire buffer. It is possible that we walked backward past invalid
+ // trailing bytes with no lead byte, in which case we want to return all those invalid bytes
+ // so that they can be processed.
+ return last - first;
+}
+
+}
+
+// Bytes that have not yet been output to the console because they are incomplete UTF-8 sequences.
+// Note that we use only one buffer even though stderr and stdout are logically separate streams.
+// This matches the behavior of Linux.
+// Protected by g_console_output_buffer_lock.
+static auto& g_console_output_buffer = *new std::vector<char>();
+
+// Internal helper function to write UTF-8 bytes to a console. Returns -1 on error.
+static int _console_write_utf8(const char* const buf, const size_t buf_size, FILE* stream,
+ HANDLE console) {
+ const int saved_errno = errno;
+ std::vector<char> combined_buffer;
+
+ // Complete UTF-8 sequences that should be immediately written to the console.
+ const char* utf8;
+ size_t utf8_size;
+
+ adb_mutex_lock(&g_console_output_buffer_lock);
+ if (g_console_output_buffer.empty()) {
+ // If g_console_output_buffer doesn't have a buffered up incomplete UTF-8 sequence (the
+ // common case with plain ASCII), parse buf directly.
+ utf8 = buf;
+ utf8_size = internal::ParseCompleteUTF8(buf, buf + buf_size, &g_console_output_buffer);
+ } else {
+ // If g_console_output_buffer has a buffered up incomplete UTF-8 sequence, move it to
+ // combined_buffer (and effectively clear g_console_output_buffer) and append buf to
+ // combined_buffer, then parse it all together.
+ combined_buffer.swap(g_console_output_buffer);
+ combined_buffer.insert(combined_buffer.end(), buf, buf + buf_size);
+
+ utf8 = combined_buffer.data();
+ utf8_size = internal::ParseCompleteUTF8(utf8, utf8 + combined_buffer.size(),
+ &g_console_output_buffer);
+ }
+ adb_mutex_unlock(&g_console_output_buffer_lock);
+
+ std::wstring utf16;
+
+ // Try to convert from data that might be UTF-8 to UTF-16, ignoring errors (just like Linux
+ // which does not return an error on bad UTF-8). Data might not be UTF-8 if the user cat's
+ // random data, runs dmesg (which might have non-UTF-8), etc.
// This could throw std::bad_alloc.
- (void)android::base::UTF8ToWide(buf, size, &output);
+ (void)android::base::UTF8ToWide(utf8, utf8_size, &utf16);
// Note that this does not do \n => \r\n translation because that
// doesn't seem necessary for the Windows console. For the Windows
@@ -3655,16 +3745,16 @@
// Write UTF-16 to the console.
DWORD written = 0;
- if (!WriteConsoleW(console, output.c_str(), output.length(), &written,
- NULL)) {
+ if (!WriteConsoleW(console, utf16.c_str(), utf16.length(), &written, NULL)) {
errno = EIO;
return -1;
}
- // This is the number of UTF-16 chars written, which might be different
- // than the number of UTF-8 chars passed in. It doesn't seem practical to
- // get this count correct.
- return written;
+ // Return the size of the original buffer passed in, signifying that we consumed it all, even
+ // if nothing was displayed, in the case of being passed an incomplete UTF-8 sequence. This
+ // matches the Linux behavior.
+ errno = saved_errno;
+ return buf_size;
}
// Function prototype because attributes cannot be placed on func definitions.
@@ -3676,14 +3766,21 @@
// Returns -1 on error.
static int _console_vfprintf(const HANDLE console, FILE* stream,
const char *format, va_list ap) {
+ const int saved_errno = errno;
std::string output_utf8;
// Format the string.
// This could throw std::bad_alloc.
android::base::StringAppendV(&output_utf8, format, ap);
- return _console_write_utf8(output_utf8.c_str(), output_utf8.length(),
- stream, console);
+ const int result = _console_write_utf8(output_utf8.c_str(), output_utf8.length(), stream,
+ console);
+ if (result != -1) {
+ errno = saved_errno;
+ } else {
+ // If -1 was returned, errno has been set.
+ }
+ return result;
}
// Version of vfprintf() that takes UTF-8 and can write Unicode to a
@@ -3705,6 +3802,11 @@
}
}
+// Version of vprintf() that takes UTF-8 and can write Unicode to a Windows console.
+int adb_vprintf(const char *format, va_list ap) {
+ return adb_vfprintf(stdout, format, ap);
+}
+
// Version of fprintf() that takes UTF-8 and can write Unicode to a
// Windows console.
int adb_fprintf(FILE *stream, const char *format, ...) {
@@ -3732,6 +3834,7 @@
int adb_fputs(const char* buf, FILE* stream) {
// adb_fprintf returns -1 on error, which is conveniently the same as EOF
// which fputs (and hence adb_fputs) should return on error.
+ static_assert(EOF == -1, "EOF is not -1, so this code needs to be fixed");
return adb_fprintf(stream, "%s", buf);
}
@@ -3739,32 +3842,32 @@
// Windows console.
int adb_fputc(int ch, FILE* stream) {
const int result = adb_fprintf(stream, "%c", ch);
- if (result <= 0) {
- // If there was an error, or if nothing was printed (which should be an
- // error), return an error, which fprintf signifies with EOF.
+ if (result == -1) {
return EOF;
}
// For success, fputc returns the char, cast to unsigned char, then to int.
return static_cast<unsigned char>(ch);
}
+// Version of putchar() that takes UTF-8 and can write Unicode to a Windows console.
+int adb_putchar(int ch) {
+ return adb_fputc(ch, stdout);
+}
+
+// Version of puts() that takes UTF-8 and can write Unicode to a Windows console.
+int adb_puts(const char* buf) {
+ // adb_printf returns -1 on error, which is conveniently the same as EOF
+ // which puts (and hence adb_puts) should return on error.
+ static_assert(EOF == -1, "EOF is not -1, so this code needs to be fixed");
+ return adb_printf("%s\n", buf);
+}
+
// Internal function to write UTF-8 to a Win32 console. Returns the number of
// items (of length size) written. On error, returns a short item count or 0.
static size_t _console_fwrite(const void* ptr, size_t size, size_t nmemb,
FILE* stream, HANDLE console) {
- // TODO: Note that a Unicode character could be several UTF-8 bytes. But
- // if we're passed only some of the bytes of a character (for example, from
- // the network socket for adb shell), we won't be able to convert the char
- // to a complete UTF-16 char (or surrogate pair), so the output won't look
- // right.
- //
- // To fix this, see libutils/Unicode.cpp for hints on decoding UTF-8.
- //
- // For now we ignore this problem because the alternative is that we'd have
- // to parse UTF-8 and buffer things up (doable). At least this is better
- // than what we had before -- always incorrect multi-byte UTF-8 output.
- int result = _console_write_utf8(reinterpret_cast<const char*>(ptr),
- size * nmemb, stream, console);
+ const int result = _console_write_utf8(reinterpret_cast<const char*>(ptr), size * nmemb, stream,
+ console);
if (result == -1) {
return 0;
}