1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. 4 * 5 * This exactly matches what is marshalled into the raw_syscall:sys_enter 6 * payload expected by the 'perf trace' beautifiers. 7 */ 8 9 #include <linux/bpf.h> 10 #include <bpf/bpf_helpers.h> 11 #include <linux/limits.h> 12 13 /** 14 * is_power_of_2() - check if a value is a power of two 15 * @n: the value to check 16 * 17 * Determine whether some value is a power of two, where zero is *not* 18 * considered a power of two. Return: true if @n is a power of 2, otherwise 19 * false. 20 */ 21 #define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) 22 23 #define MAX_CPUS 4096 24 25 // FIXME: These should come from system headers 26 #ifndef bool 27 typedef char bool; 28 #endif 29 typedef int pid_t; 30 typedef long long int __s64; 31 typedef __s64 time64_t; 32 33 struct timespec64 { 34 time64_t tv_sec; 35 long int tv_nsec; 36 }; 37 38 /* bpf-output associated map */ 39 struct __augmented_syscalls__ { 40 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 41 __type(key, int); 42 __type(value, __u32); 43 __uint(max_entries, MAX_CPUS); 44 } __augmented_syscalls__ SEC(".maps"); 45 46 /* 47 * What to augment at entry? 48 * 49 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel 50 */ 51 struct syscalls_sys_enter { 52 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 53 __type(key, __u32); 54 __type(value, __u32); 55 __uint(max_entries, 512); 56 } syscalls_sys_enter SEC(".maps"); 57 58 /* 59 * What to augment at exit? 60 * 61 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. 62 */ 63 struct syscalls_sys_exit { 64 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 65 __type(key, __u32); 66 __type(value, __u32); 67 __uint(max_entries, 512); 68 } syscalls_sys_exit SEC(".maps"); 69 70 struct syscall_enter_args { 71 unsigned long long common_tp_fields; 72 long syscall_nr; 73 unsigned long args[6]; 74 }; 75 76 struct syscall_exit_args { 77 unsigned long long common_tp_fields; 78 long syscall_nr; 79 long ret; 80 }; 81 82 struct augmented_arg { 83 unsigned int size; 84 int err; 85 char value[PATH_MAX]; 86 }; 87 88 struct pids_filtered { 89 __uint(type, BPF_MAP_TYPE_HASH); 90 __type(key, pid_t); 91 __type(value, bool); 92 __uint(max_entries, 64); 93 } pids_filtered SEC(".maps"); 94 95 /* 96 * Desired design of maximum size and alignment (see RFC2553) 97 */ 98 #define SS_MAXSIZE 128 /* Implementation specific max size */ 99 100 typedef unsigned short sa_family_t; 101 102 /* 103 * FIXME: Should come from system headers 104 * 105 * The definition uses anonymous union and struct in order to control the 106 * default alignment. 107 */ 108 struct sockaddr_storage { 109 union { 110 struct { 111 sa_family_t ss_family; /* address family */ 112 /* Following field(s) are implementation specific */ 113 char __data[SS_MAXSIZE - sizeof(unsigned short)]; 114 /* space to achieve desired size, */ 115 /* _SS_MAXSIZE value minus size of ss_family */ 116 }; 117 void *__align; /* implementation specific desired alignment */ 118 }; 119 }; 120 121 struct augmented_args_payload { 122 struct syscall_enter_args args; 123 union { 124 struct { 125 struct augmented_arg arg, arg2; 126 }; 127 struct sockaddr_storage saddr; 128 char __data[sizeof(struct augmented_arg)]; 129 }; 130 }; 131 132 // We need more tmp space than the BPF stack can give us 133 struct augmented_args_tmp { 134 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 135 __type(key, int); 136 __type(value, struct augmented_args_payload); 137 __uint(max_entries, 1); 138 } augmented_args_tmp SEC(".maps"); 139 140 static inline struct augmented_args_payload *augmented_args_payload(void) 141 { 142 int key = 0; 143 return bpf_map_lookup_elem(&augmented_args_tmp, &key); 144 } 145 146 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) 147 { 148 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ 149 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); 150 } 151 152 static inline 153 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) 154 { 155 unsigned int augmented_len = sizeof(*augmented_arg); 156 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg); 157 158 augmented_arg->size = augmented_arg->err = 0; 159 /* 160 * probe_read_str may return < 0, e.g. -EFAULT 161 * So we leave that in the augmented_arg->size that userspace will 162 */ 163 if (string_len > 0) { 164 augmented_len -= sizeof(augmented_arg->value) - string_len; 165 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); 166 augmented_len &= sizeof(augmented_arg->value) - 1; 167 augmented_arg->size = string_len; 168 } else { 169 /* 170 * So that username notice the error while still being able 171 * to skip this augmented arg record 172 */ 173 augmented_arg->err = string_len; 174 augmented_len = offsetof(struct augmented_arg, value); 175 } 176 177 return augmented_len; 178 } 179 180 SEC("tp/raw_syscalls/sys_enter") 181 int syscall_unaugmented(struct syscall_enter_args *args) 182 { 183 return 1; 184 } 185 186 /* 187 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in 188 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go 189 * on from there, reading the first syscall arg as a string, i.e. open's 190 * filename. 191 */ 192 SEC("tp/syscalls/sys_enter_connect") 193 int sys_enter_connect(struct syscall_enter_args *args) 194 { 195 struct augmented_args_payload *augmented_args = augmented_args_payload(); 196 const void *sockaddr_arg = (const void *)args->args[1]; 197 unsigned int socklen = args->args[2]; 198 unsigned int len = sizeof(augmented_args->args); 199 200 if (augmented_args == NULL) 201 return 1; /* Failure: don't filter */ 202 203 _Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two"); 204 socklen &= sizeof(augmented_args->saddr) - 1; 205 206 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); 207 208 return augmented__output(args, augmented_args, len + socklen); 209 } 210 211 SEC("tp/syscalls/sys_enter_sendto") 212 int sys_enter_sendto(struct syscall_enter_args *args) 213 { 214 struct augmented_args_payload *augmented_args = augmented_args_payload(); 215 const void *sockaddr_arg = (const void *)args->args[4]; 216 unsigned int socklen = args->args[5]; 217 unsigned int len = sizeof(augmented_args->args); 218 219 if (augmented_args == NULL) 220 return 1; /* Failure: don't filter */ 221 222 socklen &= sizeof(augmented_args->saddr) - 1; 223 224 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); 225 226 return augmented__output(args, augmented_args, len + socklen); 227 } 228 229 SEC("tp/syscalls/sys_enter_open") 230 int sys_enter_open(struct syscall_enter_args *args) 231 { 232 struct augmented_args_payload *augmented_args = augmented_args_payload(); 233 const void *filename_arg = (const void *)args->args[0]; 234 unsigned int len = sizeof(augmented_args->args); 235 236 if (augmented_args == NULL) 237 return 1; /* Failure: don't filter */ 238 239 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 240 241 return augmented__output(args, augmented_args, len); 242 } 243 244 SEC("tp/syscalls/sys_enter_openat") 245 int sys_enter_openat(struct syscall_enter_args *args) 246 { 247 struct augmented_args_payload *augmented_args = augmented_args_payload(); 248 const void *filename_arg = (const void *)args->args[1]; 249 unsigned int len = sizeof(augmented_args->args); 250 251 if (augmented_args == NULL) 252 return 1; /* Failure: don't filter */ 253 254 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 255 256 return augmented__output(args, augmented_args, len); 257 } 258 259 SEC("tp/syscalls/sys_enter_rename") 260 int sys_enter_rename(struct syscall_enter_args *args) 261 { 262 struct augmented_args_payload *augmented_args = augmented_args_payload(); 263 const void *oldpath_arg = (const void *)args->args[0], 264 *newpath_arg = (const void *)args->args[1]; 265 unsigned int len = sizeof(augmented_args->args), oldpath_len; 266 267 if (augmented_args == NULL) 268 return 1; /* Failure: don't filter */ 269 270 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 271 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 272 273 return augmented__output(args, augmented_args, len); 274 } 275 276 SEC("tp/syscalls/sys_enter_renameat") 277 int sys_enter_renameat(struct syscall_enter_args *args) 278 { 279 struct augmented_args_payload *augmented_args = augmented_args_payload(); 280 const void *oldpath_arg = (const void *)args->args[1], 281 *newpath_arg = (const void *)args->args[3]; 282 unsigned int len = sizeof(augmented_args->args), oldpath_len; 283 284 if (augmented_args == NULL) 285 return 1; /* Failure: don't filter */ 286 287 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 288 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 289 290 return augmented__output(args, augmented_args, len); 291 } 292 293 #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 294 295 // we need just the start, get the size to then copy it 296 struct perf_event_attr_size { 297 __u32 type; 298 /* 299 * Size of the attr structure, for fwd/bwd compat. 300 */ 301 __u32 size; 302 }; 303 304 SEC("tp/syscalls/sys_enter_perf_event_open") 305 int sys_enter_perf_event_open(struct syscall_enter_args *args) 306 { 307 struct augmented_args_payload *augmented_args = augmented_args_payload(); 308 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; 309 unsigned int len = sizeof(augmented_args->args); 310 311 if (augmented_args == NULL) 312 goto failure; 313 314 if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0) 315 goto failure; 316 317 attr_read = (const struct perf_event_attr_size *)augmented_args->__data; 318 319 __u32 size = attr_read->size; 320 321 if (!size) 322 size = PERF_ATTR_SIZE_VER0; 323 324 if (size > sizeof(augmented_args->__data)) 325 goto failure; 326 327 // Now that we read attr->size and tested it against the size limits, read it completely 328 if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0) 329 goto failure; 330 331 return augmented__output(args, augmented_args, len + size); 332 failure: 333 return 1; /* Failure: don't filter */ 334 } 335 336 SEC("tp/syscalls/sys_enter_clock_nanosleep") 337 int sys_enter_clock_nanosleep(struct syscall_enter_args *args) 338 { 339 struct augmented_args_payload *augmented_args = augmented_args_payload(); 340 const void *rqtp_arg = (const void *)args->args[2]; 341 unsigned int len = sizeof(augmented_args->args); 342 __u32 size = sizeof(struct timespec64); 343 344 if (augmented_args == NULL) 345 goto failure; 346 347 if (size > sizeof(augmented_args->__data)) 348 goto failure; 349 350 bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg); 351 352 return augmented__output(args, augmented_args, len + size); 353 failure: 354 return 1; /* Failure: don't filter */ 355 } 356 357 static pid_t getpid(void) 358 { 359 return bpf_get_current_pid_tgid(); 360 } 361 362 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) 363 { 364 return bpf_map_lookup_elem(pids, &pid) != NULL; 365 } 366 367 SEC("tp/raw_syscalls/sys_enter") 368 int sys_enter(struct syscall_enter_args *args) 369 { 370 struct augmented_args_payload *augmented_args; 371 /* 372 * We start len, the amount of data that will be in the perf ring 373 * buffer, if this is not filtered out by one of pid_filter__has(), 374 * syscall->enabled, etc, with the non-augmented raw syscall payload, 375 * i.e. sizeof(augmented_args->args). 376 * 377 * We'll add to this as we add augmented syscalls right after that 378 * initial, non-augmented raw_syscalls:sys_enter payload. 379 */ 380 381 if (pid_filter__has(&pids_filtered, getpid())) 382 return 0; 383 384 augmented_args = augmented_args_payload(); 385 if (augmented_args == NULL) 386 return 1; 387 388 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); 389 390 /* 391 * Jump to syscall specific augmenter, even if the default one, 392 * "!raw_syscalls:unaugmented" that will just return 1 to return the 393 * unaugmented tracepoint payload. 394 */ 395 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); 396 397 // If not found on the PROG_ARRAY syscalls map, then we're filtering it: 398 return 0; 399 } 400 401 SEC("tp/raw_syscalls/sys_exit") 402 int sys_exit(struct syscall_exit_args *args) 403 { 404 struct syscall_exit_args exit_args; 405 406 if (pid_filter__has(&pids_filtered, getpid())) 407 return 0; 408 409 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); 410 /* 411 * Jump to syscall specific return augmenter, even if the default one, 412 * "!raw_syscalls:unaugmented" that will just return 1 to return the 413 * unaugmented tracepoint payload. 414 */ 415 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); 416 /* 417 * If not found on the PROG_ARRAY syscalls map, then we're filtering it: 418 */ 419 return 0; 420 } 421 422 char _license[] SEC("license") = "GPL"; 423