xref: /openbmc/linux/tools/perf/util/demangle-rust.c (revision 9be08a27)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <string.h>
3 #include "util.h"
4 #include "debug.h"
5 
6 #include "demangle-rust.h"
7 
8 /*
9  * Mangled Rust symbols look like this:
10  *
11  *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
12  *
13  * The original symbol is:
14  *
15  *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
16  *
17  * The last component of the path is a 64-bit hash in lowercase hex, prefixed
18  * with "h". Rust does not have a global namespace between crates, an illusion
19  * which Rust maintains by using the hash to distinguish things that would
20  * otherwise have the same symbol.
21  *
22  * Any path component not starting with a XID_Start character is prefixed with
23  * "_".
24  *
25  * The following escape sequences are used:
26  *
27  *     ","  =>  $C$
28  *     "@"  =>  $SP$
29  *     "*"  =>  $BP$
30  *     "&"  =>  $RF$
31  *     "<"  =>  $LT$
32  *     ">"  =>  $GT$
33  *     "("  =>  $LP$
34  *     ")"  =>  $RP$
35  *     " "  =>  $u20$
36  *     "'"  =>  $u27$
37  *     "["  =>  $u5b$
38  *     "]"  =>  $u5d$
39  *     "~"  =>  $u7e$
40  *
41  * A double ".." means "::" and a single "." means "-".
42  *
43  * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
44  */
45 
/*
 * The last path component of a mangled Rust symbol is "::h" followed by
 * hash_len lowercase hex digits. hash_prefix_len is derived from the literal
 * itself so the two cannot drift apart.
 */
static const char hash_prefix[] = "::h";
static const size_t hash_prefix_len = sizeof(hash_prefix) - 1;
static const size_t hash_len = 16;

/* Helpers defined further down in this file. */
static bool is_prefixed_hash(const char *str);
static bool looks_like_rust(const char *sym, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);
53 
54 /*
55  * INPUT:
56  *     sym: symbol that has been through BFD-demangling
57  *
58  * This function looks for the following indicators:
59  *
60  *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
61  *
62  *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
63  *     hex digits. This is true of 99.9998% of hashes so once in your life you
64  *     may see a false negative. The point is to notice path components that
65  *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
66  *     this case a false positive (non-Rust symbol has an important path
67  *     component removed because it looks like a Rust hash) is worse than a
68  *     false negative (the rare Rust symbol is not demangled) so this sets the
69  *     balance in favor of false negatives.
70  *
71  *  3. There must be no characters other than a-zA-Z0-9 and _.:$
72  *
73  *  4. There must be no unrecognized $-sign sequences.
74  *
75  *  5. There must be no sequence of three or more dots in a row ("...").
76  */
77 bool
78 rust_is_mangled(const char *sym)
79 {
80 	size_t len, len_without_hash;
81 
82 	if (!sym)
83 		return false;
84 
85 	len = strlen(sym);
86 	if (len <= hash_prefix_len + hash_len)
87 		/* Not long enough to contain "::h" + hash + something else */
88 		return false;
89 
90 	len_without_hash = len - (hash_prefix_len + hash_len);
91 	if (!is_prefixed_hash(sym + len_without_hash))
92 		return false;
93 
94 	return looks_like_rust(sym, len_without_hash);
95 }
96 
97 /*
98  * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
99  * digits must comprise between 5 and 15 (inclusive) distinct digits.
100  */
101 static bool is_prefixed_hash(const char *str)
102 {
103 	const char *end;
104 	bool seen[16];
105 	size_t i;
106 	int count;
107 
108 	if (strncmp(str, hash_prefix, hash_prefix_len))
109 		return false;
110 	str += hash_prefix_len;
111 
112 	memset(seen, false, sizeof(seen));
113 	for (end = str + hash_len; str < end; str++)
114 		if (*str >= '0' && *str <= '9')
115 			seen[*str - '0'] = true;
116 		else if (*str >= 'a' && *str <= 'f')
117 			seen[*str - 'a' + 10] = true;
118 		else
119 			return false;
120 
121 	/* Count how many distinct digits seen */
122 	count = 0;
123 	for (i = 0; i < 16; i++)
124 		if (seen[i])
125 			count++;
126 
127 	return count >= 5 && count <= 15;
128 }
129 
/*
 * Check the part of a symbol in front of the hash suffix: @str points at its
 * first byte and @len is its length in bytes.
 *
 * Returns false as soon as it meets a byte other than a-zA-Z0-9 _ . : $, a
 * '$' that does not start one of the known escape sequences, or three or more
 * dots in a row. Returns true once all @len bytes have been accepted.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	static const char *const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
	const char *const limit = str + len;

	while (str < limit) {
		const char c = *str;

		if (c == '$') {
			size_t matched = 0;
			size_t i;

			/* No escape is a prefix of another, so order is irrelevant. */
			for (i = 0; i < nr_escapes && !matched; i++) {
				const size_t seq_len = strlen(escapes[i]);

				if (strncmp(str, escapes[i], seq_len) == 0)
					matched = seq_len;
			}
			if (!matched)
				return false;

			/* Skip the whole escape sequence at once. */
			str += matched;
			continue;
		}

		if (c == '.') {
			/* Do not allow three or more consecutive dots */
			if (strncmp(str, "...", 3) == 0)
				return false;
		} else if (!((c >= 'a' && c <= 'z') ||
			     (c >= 'A' && c <= 'Z') ||
			     (c >= '0' && c <= '9') ||
			     c == '_' || c == ':')) {
			return false;
		}

		str++;
	}

	return true;
}
174 
175 /*
176  * INPUT:
177  *     sym: symbol for which rust_is_mangled(sym) returns true
178  *
179  * The input is demangled in-place because the mangled name is always longer
180  * than the demangled one.
181  */
182 void
183 rust_demangle_sym(char *sym)
184 {
185 	const char *in;
186 	char *out;
187 	const char *end;
188 
189 	if (!sym)
190 		return;
191 
192 	in = sym;
193 	out = sym;
194 	end = sym + strlen(sym) - (hash_prefix_len + hash_len);
195 
196 	while (in < end)
197 		switch (*in) {
198 		case '$':
199 			if (!(unescape(&in, &out, "$C$", ',')
200 					|| unescape(&in, &out, "$SP$", '@')
201 					|| unescape(&in, &out, "$BP$", '*')
202 					|| unescape(&in, &out, "$RF$", '&')
203 					|| unescape(&in, &out, "$LT$", '<')
204 					|| unescape(&in, &out, "$GT$", '>')
205 					|| unescape(&in, &out, "$LP$", '(')
206 					|| unescape(&in, &out, "$RP$", ')')
207 					|| unescape(&in, &out, "$u20$", ' ')
208 					|| unescape(&in, &out, "$u27$", '\'')
209 					|| unescape(&in, &out, "$u5b$", '[')
210 					|| unescape(&in, &out, "$u5d$", ']')
211 					|| unescape(&in, &out, "$u7e$", '~'))) {
212 				pr_err("demangle-rust: unexpected escape sequence");
213 				goto done;
214 			}
215 			break;
216 		case '_':
217 			/*
218 			 * If this is the start of a path component and the next
219 			 * character is an escape sequence, ignore the
220 			 * underscore. The mangler inserts an underscore to make
221 			 * sure the path component begins with a XID_Start
222 			 * character.
223 			 */
224 			if ((in == sym || in[-1] == ':') && in[1] == '$')
225 				in++;
226 			else
227 				*out++ = *in++;
228 			break;
229 		case '.':
230 			if (in[1] == '.') {
231 				/* ".." becomes "::" */
232 				*out++ = ':';
233 				*out++ = ':';
234 				in += 2;
235 			} else {
236 				/* "." becomes "-" */
237 				*out++ = '-';
238 				in++;
239 			}
240 			break;
241 		case 'a' ... 'z':
242 		case 'A' ... 'Z':
243 		case '0' ... '9':
244 		case ':':
245 			*out++ = *in++;
246 			break;
247 		default:
248 			pr_err("demangle-rust: unexpected character '%c' in symbol\n",
249 				*in);
250 			goto done;
251 		}
252 
253 done:
254 	*out = '\0';
255 }
256 
/*
 * Try to consume the escape sequence @seq at the input cursor *in.
 *
 * On a match, @value is stored at the output cursor *out, *in is advanced
 * past the sequence, *out is advanced by one byte and true is returned.
 * Otherwise both cursors are left untouched and false is returned.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t seq_len = strlen(seq);
	const bool matched = strncmp(*in, seq, seq_len) == 0;

	if (matched) {
		*(*out)++ = value;
		*in += seq_len;
	}

	return matched;
}
271