Blame - libdexfile/dex/utf.cc - LeafOS-Project/android_art

blob: bfc704d4a6954034d8b5d378bf51256b8eb00b74 [file] [log] [blame]

Elliott Hughes	2faa5f1	2012-01-30 14:42:07 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	16
				17	#include "utf.h"
				18
Andreas Gampe	5794381	2017-12-06 21:39:13 -0800	[diff] [blame]	19	#include <android-base/logging.h>
David Sehr	0225f8e	2018-01-31 08:52:24 +0000	[diff] [blame]	20	#include <android-base/stringprintf.h>
				21	#include <android-base/strings.h>
Andreas Gampe	5794381	2017-12-06 21:39:13 -0800	[diff] [blame]	22
David Sehr	8c0961f	2018-01-23 16:11:38 -0800	[diff] [blame]	23	#include "base/casts.h"
Ian Rogers	a672490	2013-09-23 09:23:37 -0700	[diff] [blame]	24	#include "utf-inl.h"
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	25
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	26	namespace art {
				27
David Sehr	0225f8e	2018-01-31 08:52:24 +0000	[diff] [blame]	28	using android::base::StringAppendF;
David Sehr	0225f8e	2018-01-31 08:52:24 +0000	[diff] [blame]	29
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	30	// This is used only from debugger and test code.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	31	size_t CountModifiedUtf8Chars(const char* utf8) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	32	return CountModifiedUtf8Chars(utf8, strlen(utf8));
				33	}
				34
				35	/*
				36	* This does not validate UTF8 rules (nor did older code). But it gets the right answer
				37	* for valid UTF-8 and that's fine because it's used only to size a buffer for later
				38	* conversion.
				39	*
				40	* Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows:
				41	* U+0001 - U+007F 0xxxxxxx
				42	* U+0080 - U+07FF 110xxxxx 10xxxxxx
				43	* U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
				44	* U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				45	*
				46	* U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from
				47	* standard UTF-8).
				48	* The four byte encoding converts to two utf16 characters.
				49	*/
				50	size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) {
				51	DCHECK_LE(byte_count, strlen(utf8));
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	52	size_t len = 0;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	53	const char* end = utf8 + byte_count;
				54	for (; utf8 < end; ++utf8) {
				55	int ic = *utf8;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	56	len++;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	57	if (LIKELY((ic & 0x80) == 0)) {
				58	// One-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	59	continue;
				60	}
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	61	// Two- or three-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	62	utf8++;
				63	if ((ic & 0x20) == 0) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	64	// Two-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	65	continue;
				66	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	67	utf8++;
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	68	if ((ic & 0x10) == 0) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	69	// Three-byte encoding.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	70	continue;
				71	}
				72
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	73	// Four-byte encoding: needs to be converted into a surrogate
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	74	// pair.
				75	utf8++;
				76	len++;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	77	}
				78	return len;
				79	}
				80
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	81	// This is used only from debugger and test code.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	82	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
				83	while (*utf8_data_in != '\0') {
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	84	const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
				85	const uint16_t leading = GetLeadingUtf16Char(ch);
				86	const uint16_t trailing = GetTrailingUtf16Char(ch);
				87
				88	*utf16_data_out++ = leading;
				89	if (trailing != 0) {
				90	*utf16_data_out++ = trailing;
				91	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	92	}
				93	}
				94
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	95	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
				96	const char* utf8_data_in, size_t in_bytes) {
				97	const char *in_start = utf8_data_in;
				98	const char *in_end = utf8_data_in + in_bytes;
				99	uint16_t *out_p = utf16_data_out;
				100
				101	if (LIKELY(out_chars == in_bytes)) {
				102	// Common case where all characters are ASCII.
				103	for (const char *p = in_start; p < in_end;) {
				104	// Safe even if char is signed because ASCII characters always have
				105	// the high bit cleared.
				106	out_p++ = dchecked_integral_cast<uint16_t>(p++);
				107	}
				108	return;
				109	}
				110
				111	// String contains non-ASCII characters.
				112	for (const char *p = in_start; p < in_end;) {
				113	const uint32_t ch = GetUtf16FromUtf8(&p);
				114	const uint16_t leading = GetLeadingUtf16Char(ch);
				115	const uint16_t trailing = GetTrailingUtf16Char(ch);
				116
				117	*out_p++ = leading;
				118	if (trailing != 0) {
				119	*out_p++ = trailing;
				120	}
				121	}
				122	}
				123
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	124	void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
				125	const uint16_t* utf16_in, size_t char_count) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	126	if (LIKELY(byte_count == char_count)) {
				127	// Common case where all characters are ASCII.
				128	const uint16_t *utf16_end = utf16_in + char_count;
				129	for (const uint16_t *p = utf16_in; p < utf16_end;) {
				130	utf8_out++ = dchecked_integral_cast<char>(p++);
				131	}
				132	return;
				133	}
				134
				135	// String contains non-ASCII characters.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	136	while (char_count--) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	137	const uint16_t ch = *utf16_in++;
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	138	if (ch > 0 && ch <= 0x7f) {
				139	*utf8_out++ = ch;
				140	} else {
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	141	// Char_count == 0 here implies we've encountered an unpaired
				142	// surrogate and we have no choice but to encode it as 3-byte UTF
				143	// sequence. Note that unpaired surrogates can occur as a part of
				144	// "normal" operation.
				145	if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
				146	const uint16_t ch2 = *utf16_in;
				147
				148	// Check if the other half of the pair is within the expected
				149	// range. If it isn't, we will have to emit both "halves" as
				150	// separate 3 byte sequences.
				151	if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
				152	utf16_in++;
				153	char_count--;
				154	const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
				155	*utf8_out++ = (code_point >> 18) \| 0xf0;
				156	*utf8_out++ = ((code_point >> 12) & 0x3f) \| 0x80;
				157	*utf8_out++ = ((code_point >> 6) & 0x3f) \| 0x80;
				158	*utf8_out++ = (code_point & 0x3f) \| 0x80;
				159	continue;
				160	}
				161	}
				162
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	163	if (ch > 0x07ff) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	164	// Three byte encoding.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	165	*utf8_out++ = (ch >> 12) \| 0xe0;
				166	*utf8_out++ = ((ch >> 6) & 0x3f) \| 0x80;
				167	*utf8_out++ = (ch & 0x3f) \| 0x80;
				168	} else /(ch > 0x7f \|\| ch == 0)/ {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	169	// Two byte encoding.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	170	*utf8_out++ = (ch >> 6) \| 0xc0;
				171	*utf8_out++ = (ch & 0x3f) \| 0x80;
				172	}
				173	}
				174	}
				175	}
				176
Vladimir Marko	cac5a7e	2016-02-22 10:39:50 +0000	[diff] [blame]	177	int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
				178	uint32_t hash = 0;
				179	while (utf16_length != 0u) {
				180	const uint32_t pair = GetUtf16FromUtf8(&utf8);
				181	const uint16_t first = GetLeadingUtf16Char(pair);
				182	hash = hash * 31 + first;
				183	--utf16_length;
				184	const uint16_t second = GetTrailingUtf16Char(pair);
				185	if (second != 0) {
				186	hash = hash * 31 + second;
				187	DCHECK_NE(utf16_length, 0u);
				188	--utf16_length;
				189	}
				190	}
				191	return static_cast<int32_t>(hash);
				192	}
				193
Mathieu Chartier	208a5cb	2015-12-02 15:44:07 -0800	[diff] [blame]	194	uint32_t ComputeModifiedUtf8Hash(const char* chars) {
Vladimir Marko	b4bd92f	2021-07-05 12:18:26 +0100	[diff] [blame]	195	uint32_t hash = StartModifiedUtf8Hash();
Ian Rogers	68b5685	2014-08-29 20:19:11 -0700	[diff] [blame]	196	while (*chars != '\0') {
Vladimir Marko	b4bd92f	2021-07-05 12:18:26 +0100	[diff] [blame]	197	hash = UpdateModifiedUtf8Hash(hash, *chars);
Vladimir Marko	ca0f2dc	2018-12-10 12:14:36 +0000	[diff] [blame]	198	++chars;
Ian Rogers	68b5685	2014-08-29 20:19:11 -0700	[diff] [blame]	199	}
Vladimir Marko	ca0f2dc	2018-12-10 12:14:36 +0000	[diff] [blame]	200	return hash;
Ian Rogers	68b5685	2014-08-29 20:19:11 -0700	[diff] [blame]	201	}
				202
Vladimir Marko	b4bd92f	2021-07-05 12:18:26 +0100	[diff] [blame]	203	uint32_t ComputeModifiedUtf8Hash(std::string_view chars) {
				204	return UpdateModifiedUtf8Hash(StartModifiedUtf8Hash(), chars);
				205	}
				206
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	207	int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
				208	size_t utf16_length) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	209	for (;;) {
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	210	if (*utf8 == '\0') {
				211	return (utf16_length == 0) ? 0 : -1;
				212	} else if (utf16_length == 0) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	213	return 1;
				214	}
				215
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	216	const uint32_t pair = GetUtf16FromUtf8(&utf8);
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	217
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	218	// First compare the leading utf16 char.
				219	const uint16_t lhs = GetLeadingUtf16Char(pair);
				220	const uint16_t rhs = *utf16++;
				221	--utf16_length;
				222	if (lhs != rhs) {
				223	return lhs > rhs ? 1 : -1;
				224	}
				225
				226	// Then compare the trailing utf16 char. First check if there
				227	// are any characters left to consume.
				228	const uint16_t lhs2 = GetTrailingUtf16Char(pair);
				229	if (lhs2 != 0) {
				230	if (utf16_length == 0) {
				231	return 1;
				232	}
				233
				234	const uint16_t rhs2 = *utf16++;
				235	--utf16_length;
				236	if (lhs2 != rhs2) {
				237	return lhs2 > rhs2 ? 1 : -1;
				238	}
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	239	}
				240	}
				241	}
				242
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	243	size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	244	size_t result = 0;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	245	const uint16_t *end = chars + char_count;
				246	while (chars < end) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	247	const uint16_t ch = *chars++;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	248	if (LIKELY(ch != 0 && ch < 0x80)) {
				249	result++;
				250	continue;
				251	}
				252	if (ch < 0x800) {
				253	result += 2;
				254	continue;
				255	}
Chuck Liao	1b9d442	2021-07-12 01:25:23 +0000	[diff] [blame]	256	if (ch >= 0xd800 && ch < 0xdc00) {
				257	if (chars < end) {
				258	const uint16_t ch2 = *chars;
				259	// If we find a properly paired surrogate, we emit it as a 4 byte
				260	// UTF sequence. If we find an unpaired leading or trailing surrogate,
				261	// we emit it as a 3 byte sequence like would have done earlier.
				262	if (ch2 >= 0xdc00 && ch2 < 0xe000) {
				263	chars++;
				264	result += 4;
				265	continue;
				266	}
				267	}
				268	}
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	269	result += 3;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	270	}
				271	return result;
				272	}
				273
// Returns true when `ch` lies outside the printable ASCII range [' ', '~'], i.e. when it
// cannot be appended to the output verbatim and must be written as an escape instead.
static inline constexpr bool NeedsEscaping(uint16_t ch) {
  return (ch < ' ' || ch > '~');
}
				277
				278	std::string PrintableChar(uint16_t ch) {
				279	std::string result;
				280	result += '\'';
				281	if (NeedsEscaping(ch)) {
				282	StringAppendF(&result, "\\u%04x", ch);
				283	} else {
				284	result += static_cast<std::string::value_type>(ch);
				285	}
				286	result += '\'';
				287	return result;
				288	}
				289
Vladimir Marko	f1d973d	2019-03-19 13:38:34 +0000	[diff] [blame]	290	std::string PrintableString(const char* utf8) {
David Sehr	0225f8e	2018-01-31 08:52:24 +0000	[diff] [blame]	291	std::string result;
				292	result += '"';
Vladimir Marko	f1d973d	2019-03-19 13:38:34 +0000	[diff] [blame]	293	const char* p = utf8;
David Sehr	0225f8e	2018-01-31 08:52:24 +0000	[diff] [blame]	294	size_t char_count = CountModifiedUtf8Chars(p);
				295	for (size_t i = 0; i < char_count; ++i) {
				296	uint32_t ch = GetUtf16FromUtf8(&p);
				297	if (ch == '\\') {
				298	result += "\\\\";
				299	} else if (ch == '\n') {
				300	result += "\\n";
				301	} else if (ch == '\r') {
				302	result += "\\r";
				303	} else if (ch == '\t') {
				304	result += "\\t";
				305	} else {
				306	const uint16_t leading = GetLeadingUtf16Char(ch);
				307
				308	if (NeedsEscaping(leading)) {
				309	StringAppendF(&result, "\\u%04x", leading);
				310	} else {
				311	result += static_cast<std::string::value_type>(leading);
				312	}
				313
				314	const uint32_t trailing = GetTrailingUtf16Char(ch);
				315	if (trailing != 0) {
				316	// All high surrogates will need escaping.
				317	StringAppendF(&result, "\\u%04x", trailing);
Vladimir Marko	f1d973d	2019-03-19 13:38:34 +0000	[diff] [blame]	318	// Account for the surrogate pair.
				319	++i;
				320	DCHECK_LT(i, char_count);
David Sehr	0225f8e	2018-01-31 08:52:24 +0000	[diff] [blame]	321	}
				322	}
				323	}
				324	result += '"';
				325	return result;
				326	}
				327
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	328	} // namespace art