1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. 4 * 5 * This exactly matches what is marshalled into the raw_syscall:sys_enter 6 * payload expected by the 'perf trace' beautifiers. 7 */ 8 9 #include <linux/bpf.h> 10 #include <bpf/bpf_helpers.h> 11 #include <linux/limits.h> 12 13 /** 14 * is_power_of_2() - check if a value is a power of two 15 * @n: the value to check 16 * 17 * Determine whether some value is a power of two, where zero is *not* 18 * considered a power of two. Return: true if @n is a power of 2, otherwise 19 * false. 20 */ 21 #define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) 22 23 #define MAX_CPUS 4096 24 25 // FIXME: These should come from system headers 26 typedef char bool; 27 typedef int pid_t; 28 typedef long long int __s64; 29 typedef __s64 time64_t; 30 31 struct timespec64 { 32 time64_t tv_sec; 33 long int tv_nsec; 34 }; 35 36 /* bpf-output associated map */ 37 struct __augmented_syscalls__ { 38 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 39 __type(key, int); 40 __type(value, __u32); 41 __uint(max_entries, MAX_CPUS); 42 } __augmented_syscalls__ SEC(".maps"); 43 44 /* 45 * What to augment at entry? 46 * 47 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel 48 */ 49 struct syscalls_sys_enter { 50 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 51 __type(key, __u32); 52 __type(value, __u32); 53 __uint(max_entries, 512); 54 } syscalls_sys_enter SEC(".maps"); 55 56 /* 57 * What to augment at exit? 58 * 59 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. 60 */ 61 struct syscalls_sys_exit { 62 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 63 __type(key, __u32); 64 __type(value, __u32); 65 __uint(max_entries, 512); 66 } syscalls_sys_exit SEC(".maps"); 67 68 struct syscall_enter_args { 69 unsigned long long common_tp_fields; 70 long syscall_nr; 71 unsigned long args[6]; 72 }; 73 74 struct syscall_exit_args { 75 unsigned long long common_tp_fields; 76 long syscall_nr; 77 long ret; 78 }; 79 80 struct augmented_arg { 81 unsigned int size; 82 int err; 83 char value[PATH_MAX]; 84 }; 85 86 struct pids_filtered { 87 __uint(type, BPF_MAP_TYPE_HASH); 88 __type(key, pid_t); 89 __type(value, bool); 90 __uint(max_entries, 64); 91 } pids_filtered SEC(".maps"); 92 93 /* 94 * Desired design of maximum size and alignment (see RFC2553) 95 */ 96 #define SS_MAXSIZE 128 /* Implementation specific max size */ 97 98 typedef unsigned short sa_family_t; 99 100 /* 101 * FIXME: Should come from system headers 102 * 103 * The definition uses anonymous union and struct in order to control the 104 * default alignment. 105 */ 106 struct sockaddr_storage { 107 union { 108 struct { 109 sa_family_t ss_family; /* address family */ 110 /* Following field(s) are implementation specific */ 111 char __data[SS_MAXSIZE - sizeof(unsigned short)]; 112 /* space to achieve desired size, */ 113 /* _SS_MAXSIZE value minus size of ss_family */ 114 }; 115 void *__align; /* implementation specific desired alignment */ 116 }; 117 }; 118 119 struct augmented_args_payload { 120 struct syscall_enter_args args; 121 union { 122 struct { 123 struct augmented_arg arg, arg2; 124 }; 125 struct sockaddr_storage saddr; 126 char __data[sizeof(struct augmented_arg)]; 127 }; 128 }; 129 130 // We need more tmp space than the BPF stack can give us 131 struct augmented_args_tmp { 132 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 133 __type(key, int); 134 __type(value, struct augmented_args_payload); 135 __uint(max_entries, 1); 136 } augmented_args_tmp SEC(".maps"); 137 138 static inline struct augmented_args_payload *augmented_args_payload(void) 139 { 140 int key = 0; 141 return bpf_map_lookup_elem(&augmented_args_tmp, &key); 142 } 143 144 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) 145 { 146 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ 147 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); 148 } 149 150 static inline 151 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) 152 { 153 unsigned int augmented_len = sizeof(*augmented_arg); 154 int string_len = bpf_probe_read_str(&augmented_arg->value, arg_len, arg); 155 156 augmented_arg->size = augmented_arg->err = 0; 157 /* 158 * probe_read_str may return < 0, e.g. -EFAULT 159 * So we leave that in the augmented_arg->size that userspace will 160 */ 161 if (string_len > 0) { 162 augmented_len -= sizeof(augmented_arg->value) - string_len; 163 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); 164 augmented_len &= sizeof(augmented_arg->value) - 1; 165 augmented_arg->size = string_len; 166 } else { 167 /* 168 * So that username notice the error while still being able 169 * to skip this augmented arg record 170 */ 171 augmented_arg->err = string_len; 172 augmented_len = offsetof(struct augmented_arg, value); 173 } 174 175 return augmented_len; 176 } 177 178 SEC("tp/raw_syscalls/sys_enter") 179 int syscall_unaugmented(struct syscall_enter_args *args) 180 { 181 return 1; 182 } 183 184 /* 185 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in 186 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go 187 * on from there, reading the first syscall arg as a string, i.e. open's 188 * filename. 189 */ 190 SEC("tp/syscalls/sys_enter_connect") 191 int sys_enter_connect(struct syscall_enter_args *args) 192 { 193 struct augmented_args_payload *augmented_args = augmented_args_payload(); 194 const void *sockaddr_arg = (const void *)args->args[1]; 195 unsigned int socklen = args->args[2]; 196 unsigned int len = sizeof(augmented_args->args); 197 198 if (augmented_args == NULL) 199 return 1; /* Failure: don't filter */ 200 201 _Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two"); 202 socklen &= sizeof(augmented_args->saddr) - 1; 203 204 bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg); 205 206 return augmented__output(args, augmented_args, len + socklen); 207 } 208 209 SEC("tp/syscalls/sys_enter_sendto") 210 int sys_enter_sendto(struct syscall_enter_args *args) 211 { 212 struct augmented_args_payload *augmented_args = augmented_args_payload(); 213 const void *sockaddr_arg = (const void *)args->args[4]; 214 unsigned int socklen = args->args[5]; 215 unsigned int len = sizeof(augmented_args->args); 216 217 if (augmented_args == NULL) 218 return 1; /* Failure: don't filter */ 219 220 socklen &= sizeof(augmented_args->saddr) - 1; 221 222 bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg); 223 224 return augmented__output(args, augmented_args, len + socklen); 225 } 226 227 SEC("tp/syscalls/sys_enter_open") 228 int sys_enter_open(struct syscall_enter_args *args) 229 { 230 struct augmented_args_payload *augmented_args = augmented_args_payload(); 231 const void *filename_arg = (const void *)args->args[0]; 232 unsigned int len = sizeof(augmented_args->args); 233 234 if (augmented_args == NULL) 235 return 1; /* Failure: don't filter */ 236 237 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 238 239 return augmented__output(args, augmented_args, len); 240 } 241 242 SEC("tp/syscalls/sys_enter_openat") 243 int sys_enter_openat(struct syscall_enter_args *args) 244 { 245 struct augmented_args_payload *augmented_args = augmented_args_payload(); 246 const void *filename_arg = (const void *)args->args[1]; 247 unsigned int len = sizeof(augmented_args->args); 248 249 if (augmented_args == NULL) 250 return 1; /* Failure: don't filter */ 251 252 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 253 254 return augmented__output(args, augmented_args, len); 255 } 256 257 SEC("tp/syscalls/sys_enter_rename") 258 int sys_enter_rename(struct syscall_enter_args *args) 259 { 260 struct augmented_args_payload *augmented_args = augmented_args_payload(); 261 const void *oldpath_arg = (const void *)args->args[0], 262 *newpath_arg = (const void *)args->args[1]; 263 unsigned int len = sizeof(augmented_args->args), oldpath_len; 264 265 if (augmented_args == NULL) 266 return 1; /* Failure: don't filter */ 267 268 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 269 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 270 271 return augmented__output(args, augmented_args, len); 272 } 273 274 SEC("tp/syscalls/sys_enter_renameat") 275 int sys_enter_renameat(struct syscall_enter_args *args) 276 { 277 struct augmented_args_payload *augmented_args = augmented_args_payload(); 278 const void *oldpath_arg = (const void *)args->args[1], 279 *newpath_arg = (const void *)args->args[3]; 280 unsigned int len = sizeof(augmented_args->args), oldpath_len; 281 282 if (augmented_args == NULL) 283 return 1; /* Failure: don't filter */ 284 285 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 286 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 287 288 return augmented__output(args, augmented_args, len); 289 } 290 291 #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 292 293 // we need just the start, get the size to then copy it 294 struct perf_event_attr_size { 295 __u32 type; 296 /* 297 * Size of the attr structure, for fwd/bwd compat. 298 */ 299 __u32 size; 300 }; 301 302 SEC("tp/syscalls/sys_enter_perf_event_open") 303 int sys_enter_perf_event_open(struct syscall_enter_args *args) 304 { 305 struct augmented_args_payload *augmented_args = augmented_args_payload(); 306 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; 307 unsigned int len = sizeof(augmented_args->args); 308 309 if (augmented_args == NULL) 310 goto failure; 311 312 if (bpf_probe_read(&augmented_args->__data, sizeof(*attr), attr) < 0) 313 goto failure; 314 315 attr_read = (const struct perf_event_attr_size *)augmented_args->__data; 316 317 __u32 size = attr_read->size; 318 319 if (!size) 320 size = PERF_ATTR_SIZE_VER0; 321 322 if (size > sizeof(augmented_args->__data)) 323 goto failure; 324 325 // Now that we read attr->size and tested it against the size limits, read it completely 326 if (bpf_probe_read(&augmented_args->__data, size, attr) < 0) 327 goto failure; 328 329 return augmented__output(args, augmented_args, len + size); 330 failure: 331 return 1; /* Failure: don't filter */ 332 } 333 334 SEC("tp/syscalls/sys_enter_clock_nanosleep") 335 int sys_enter_clock_nanosleep(struct syscall_enter_args *args) 336 { 337 struct augmented_args_payload *augmented_args = augmented_args_payload(); 338 const void *rqtp_arg = (const void *)args->args[2]; 339 unsigned int len = sizeof(augmented_args->args); 340 __u32 size = sizeof(struct timespec64); 341 342 if (augmented_args == NULL) 343 goto failure; 344 345 if (size > sizeof(augmented_args->__data)) 346 goto failure; 347 348 bpf_probe_read(&augmented_args->__data, size, rqtp_arg); 349 350 return augmented__output(args, augmented_args, len + size); 351 failure: 352 return 1; /* Failure: don't filter */ 353 } 354 355 static pid_t getpid(void) 356 { 357 return bpf_get_current_pid_tgid(); 358 } 359 360 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) 361 { 362 return bpf_map_lookup_elem(pids, &pid) != NULL; 363 } 364 365 SEC("tp/raw_syscalls/sys_enter") 366 int sys_enter(struct syscall_enter_args *args) 367 { 368 struct augmented_args_payload *augmented_args; 369 /* 370 * We start len, the amount of data that will be in the perf ring 371 * buffer, if this is not filtered out by one of pid_filter__has(), 372 * syscall->enabled, etc, with the non-augmented raw syscall payload, 373 * i.e. sizeof(augmented_args->args). 374 * 375 * We'll add to this as we add augmented syscalls right after that 376 * initial, non-augmented raw_syscalls:sys_enter payload. 377 */ 378 379 if (pid_filter__has(&pids_filtered, getpid())) 380 return 0; 381 382 augmented_args = augmented_args_payload(); 383 if (augmented_args == NULL) 384 return 1; 385 386 bpf_probe_read(&augmented_args->args, sizeof(augmented_args->args), args); 387 388 /* 389 * Jump to syscall specific augmenter, even if the default one, 390 * "!raw_syscalls:unaugmented" that will just return 1 to return the 391 * unaugmented tracepoint payload. 392 */ 393 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); 394 395 // If not found on the PROG_ARRAY syscalls map, then we're filtering it: 396 return 0; 397 } 398 399 SEC("tp/raw_syscalls/sys_exit") 400 int sys_exit(struct syscall_exit_args *args) 401 { 402 struct syscall_exit_args exit_args; 403 404 if (pid_filter__has(&pids_filtered, getpid())) 405 return 0; 406 407 bpf_probe_read(&exit_args, sizeof(exit_args), args); 408 /* 409 * Jump to syscall specific return augmenter, even if the default one, 410 * "!raw_syscalls:unaugmented" that will just return 1 to return the 411 * unaugmented tracepoint payload. 412 */ 413 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); 414 /* 415 * If not found on the PROG_ARRAY syscalls map, then we're filtering it: 416 */ 417 return 0; 418 } 419 420 char _license[] SEC("license") = "GPL"; 421