// SPDX-License-Identifier: GPL-2.0
#include <string.h>
#include "util.h"
#include "debug.h"

#include "demangle-rust.h"

/*
 * Mangled Rust symbols look like this:
 *
 *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
 *
 * The original symbol is:
 *
 *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
 *
 * The last component of the path is a 64-bit hash in lowercase hex, prefixed
 * with "h". Rust does not have a global namespace between crates, an illusion
 * which Rust maintains by using the hash to distinguish things that would
 * otherwise have the same symbol.
 *
 * Any path component not starting with a XID_Start character is prefixed with
 * "_".
 *
 * The following escape sequences are used:
 *
 *     ","  =>  $C$
 *     "@"  =>  $SP$
 *     "*"  =>  $BP$
 *     "&"  =>  $RF$
 *     "<"  =>  $LT$
 *     ">"  =>  $GT$
 *     "("  =>  $LP$
 *     ")"  =>  $RP$
 *     " "  =>  $u20$
 *     "'"  =>  $u27$
 *     "["  =>  $u5b$
 *     "]"  =>  $u5d$
 *     "~"  =>  $u7e$
 *
 * A double ".." means "::" and a single "." means "-".
 *
 * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
 */

/* Path separator plus the "h" marker that introduces the trailing hash. */
static const char *hash_prefix = "::h";
/* Number of characters in hash_prefix, not counting the terminating NUL. */
static const size_t hash_prefix_len = 3;
/* Number of hex digits that make up the hash itself. */
static const size_t hash_len = 16;

/* Forward declarations of the file-local helpers defined further down. */
static bool is_prefixed_hash(const char *start);
static bool looks_like_rust(const char *sym, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);

/*
 * INPUT:
 *     sym: symbol that has been through BFD-demangling
 *
 * This function looks for the following indicators:
 *
 *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
 *
 *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
 *     hex digits. This is true of 99.9998% of hashes so once in your life you
 *     may see a false negative.
The point is to notice path components that
 *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
 *     this case a false positive (non-Rust symbol has an important path
 *     component removed because it looks like a Rust hash) is worse than a
 *     false negative (the rare Rust symbol is not demangled) so this sets the
 *     balance in favor of false negatives.
 *
 *  3. There must be no characters other than a-zA-Z0-9 and _.:$
 *
 *  4. There must be no unrecognized $-sign sequences.
 *
 *  5. There must be no sequence of three or more dots in a row ("...").
 */
bool
rust_is_mangled(const char *sym)
{
	/* Characters occupied by the trailing "::h" + hash component. */
	const size_t suffix_len = hash_prefix_len + hash_len;
	size_t total;

	if (sym == NULL)
		return false;

	total = strlen(sym);

	/* There must be at least one character of path before the hash. */
	if (total <= suffix_len)
		return false;

	/*
	 * The tail has to be a plausible hash and everything in front of it
	 * has to consist solely of characters and escapes the mangler emits.
	 */
	return is_prefixed_hash(sym + (total - suffix_len))
	       && looks_like_rust(sym, total - suffix_len);
}

/*
 * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
 * digits must comprise between 5 and 15 (inclusive) distinct digits.
100 */ 101 static bool is_prefixed_hash(const char *str) 102 { 103 const char *end; 104 bool seen[16]; 105 size_t i; 106 int count; 107 108 if (strncmp(str, hash_prefix, hash_prefix_len)) 109 return false; 110 str += hash_prefix_len; 111 112 memset(seen, false, sizeof(seen)); 113 for (end = str + hash_len; str < end; str++) 114 if (*str >= '0' && *str <= '9') 115 seen[*str - '0'] = true; 116 else if (*str >= 'a' && *str <= 'f') 117 seen[*str - 'a' + 10] = true; 118 else 119 return false; 120 121 /* Count how many distinct digits seen */ 122 count = 0; 123 for (i = 0; i < 16; i++) 124 if (seen[i]) 125 count++; 126 127 return count >= 5 && count <= 15; 128 } 129 130 static bool looks_like_rust(const char *str, size_t len) 131 { 132 const char *end = str + len; 133 134 while (str < end) 135 switch (*str) { 136 case '$': 137 if (!strncmp(str, "$C$", 3)) 138 str += 3; 139 else if (!strncmp(str, "$SP$", 4) 140 || !strncmp(str, "$BP$", 4) 141 || !strncmp(str, "$RF$", 4) 142 || !strncmp(str, "$LT$", 4) 143 || !strncmp(str, "$GT$", 4) 144 || !strncmp(str, "$LP$", 4) 145 || !strncmp(str, "$RP$", 4)) 146 str += 4; 147 else if (!strncmp(str, "$u20$", 5) 148 || !strncmp(str, "$u27$", 5) 149 || !strncmp(str, "$u5b$", 5) 150 || !strncmp(str, "$u5d$", 5) 151 || !strncmp(str, "$u7e$", 5)) 152 str += 5; 153 else 154 return false; 155 break; 156 case '.': 157 /* Do not allow three or more consecutive dots */ 158 if (!strncmp(str, "...", 3)) 159 return false; 160 /* Fall through */ 161 case 'a' ... 'z': 162 case 'A' ... 'Z': 163 case '0' ... '9': 164 case '_': 165 case ':': 166 str++; 167 break; 168 default: 169 return false; 170 } 171 172 return true; 173 } 174 175 /* 176 * INPUT: 177 * sym: symbol for which rust_is_mangled(sym) returns true 178 * 179 * The input is demangled in-place because the mangled name is always longer 180 * than the demangled one. 
181 */ 182 void 183 rust_demangle_sym(char *sym) 184 { 185 const char *in; 186 char *out; 187 const char *end; 188 189 if (!sym) 190 return; 191 192 in = sym; 193 out = sym; 194 end = sym + strlen(sym) - (hash_prefix_len + hash_len); 195 196 while (in < end) 197 switch (*in) { 198 case '$': 199 if (!(unescape(&in, &out, "$C$", ',') 200 || unescape(&in, &out, "$SP$", '@') 201 || unescape(&in, &out, "$BP$", '*') 202 || unescape(&in, &out, "$RF$", '&') 203 || unescape(&in, &out, "$LT$", '<') 204 || unescape(&in, &out, "$GT$", '>') 205 || unescape(&in, &out, "$LP$", '(') 206 || unescape(&in, &out, "$RP$", ')') 207 || unescape(&in, &out, "$u20$", ' ') 208 || unescape(&in, &out, "$u27$", '\'') 209 || unescape(&in, &out, "$u5b$", '[') 210 || unescape(&in, &out, "$u5d$", ']') 211 || unescape(&in, &out, "$u7e$", '~'))) { 212 pr_err("demangle-rust: unexpected escape sequence"); 213 goto done; 214 } 215 break; 216 case '_': 217 /* 218 * If this is the start of a path component and the next 219 * character is an escape sequence, ignore the 220 * underscore. The mangler inserts an underscore to make 221 * sure the path component begins with a XID_Start 222 * character. 223 */ 224 if ((in == sym || in[-1] == ':') && in[1] == '$') 225 in++; 226 else 227 *out++ = *in++; 228 break; 229 case '.': 230 if (in[1] == '.') { 231 /* ".." becomes "::" */ 232 *out++ = ':'; 233 *out++ = ':'; 234 in += 2; 235 } else { 236 /* "." becomes "-" */ 237 *out++ = '-'; 238 in++; 239 } 240 break; 241 case 'a' ... 'z': 242 case 'A' ... 'Z': 243 case '0' ... 
'9': 244 case ':': 245 *out++ = *in++; 246 break; 247 default: 248 pr_err("demangle-rust: unexpected character '%c' in symbol\n", 249 *in); 250 goto done; 251 } 252 253 done: 254 *out = '\0'; 255 } 256 257 static bool unescape(const char **in, char **out, const char *seq, char value) 258 { 259 size_t len = strlen(seq); 260 261 if (strncmp(*in, seq, len)) 262 return false; 263 264 **out = value; 265 266 *in += len; 267 *out += 1; 268 269 return true; 270 } 271