// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 1999, 2023
 */

#include <linux/cpuhotplug.h>
#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <asm/asm-extable.h>
#include <asm/pfault.h>
#include <asm/diag.h>

#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000UL

/*
 * 'pfault' pseudo page fault routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}
early_param("nopfault", nopfault);

struct pfault_refbk {
	u16 refdiagc;
	u16 reffcode;
	u16 refdwlen;
	u16 refversn;
	u64 refgaddr;
	u64 refselmk;
	u64 refcmpmk;
	u64 reserved;
};

static struct pfault_refbk pfault_init_refbk = {
	.refdiagc = 0x258,
	.reffcode = 0,
	.refdwlen = 5,
	.refversn = 2,
	/* Token address: the lowcore LPP field; its lower bits hold the pid
	 * of the current task (see pfault_interrupt()). */
	.refgaddr = __LC_LPP,
	.refselmk = 1UL << 48,
	.refcmpmk = 1UL << 48,
	.reserved = __PF_RES_FIELD
};

int __pfault_init(void)
{
	int rc = -EOPNOTSUPP;

	if (pfault_disable)
		return rc;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%[refbk],%[rc],0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b, 0b)
		: [rc] "+d" (rc)
		: [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
		: "cc");
	return rc;
}

static struct pfault_refbk pfault_fini_refbk = {
	.refdiagc = 0x258,
	.reffcode = 1,
	.refdwlen = 5,
	.refversn = 2,
};

void __pfault_fini(void)
{
	if (pfault_disable)
		return;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%[refbk],0,0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b, 0b)
		:
		: [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
		: "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE	0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest and a
 * user space process accesses a page that the host has paged out, we get a
 * pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt we set the state of the current task to
 * uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule(). It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt (->
 * host signals that a page of a process has been paged in and the process can
 * continue to run). This interrupt can arrive on any cpu and, since we have
 * virtual cpus, may even arrive before the interrupt that signals that a
 * page is missing.
 */
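/*
 * In short, tsk->thread.pfault_wait encodes which of the two interrupts
 * has been seen so far for a task:
 *
 *	 0: no pseudo page fault pending
 *	 1: initial interrupt arrived first; the task holds an extra
 *	    reference, is queued on pfault_list and is put to sleep
 *	-1: completion interrupt overtook the initial interrupt; the
 *	    initial interrupt then only resets the state back to 0
 *	    instead of putting the task to sleep
 */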
static void pfault_interrupt(struct ext_code ext_code,
			     unsigned int param32, unsigned long param64)
{
	struct task_struct *tsk;
	__u16 subcode;
	pid_t pid;

	/*
	 * Get the external interruption subcode & pfault initial/completion
	 * signal bit. VM stores this in the 'cpu address' field associated
	 * with the external interrupt.
	 */
	subcode = ext_code.subcode;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;
	inc_irq_stat(IRQEXT_PFL);
	/* Get the token (= pid of the affected task). */
	pid = param64 & LPP_PID_MASK;
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	if (tsk)
		get_task_struct(tsk);
	rcu_read_unlock();
	if (!tsk)
		return;
	spin_lock(&pfault_lock);
	if (subcode & PF_COMPLETE) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (tsk->thread.pfault_wait == 1) {
			/*
			 * Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults.
			 */
			tsk->thread.pfault_wait = 0;
			list_del(&tsk->thread.list);
			wake_up_process(tsk);
			/* Drop the reference taken by the initial interrupt. */
			put_task_struct(tsk);
		} else {
			/*
			 * Completion interrupt was faster than initial
			 * interrupt. Set pfault_wait to -1 so the initial
			 * interrupt doesn't put the task to sleep.
			 * If the task is not running, ignore the completion
			 * interrupt since it must be a leftover of a PFAULT
			 * CANCEL operation which didn't remove all pending
			 * completion interrupts.
			 */
			if (task_is_running(tsk))
				tsk->thread.pfault_wait = -1;
		}
	} else {
		/* signal bit not set -> a real page is missing. */
		if (WARN_ON_ONCE(tsk != current))
			goto out;
		if (tsk->thread.pfault_wait == 1) {
			/* Already on the list with a reference: put to sleep */
			goto block;
		} else if (tsk->thread.pfault_wait == -1) {
			/*
			 * Completion interrupt was faster than the initial
			 * interrupt (pfault_wait == -1). Set pfault_wait
			 * back to zero and exit.
			 */
			tsk->thread.pfault_wait = 0;
		} else {
			/*
			 * Initial interrupt arrived before completion
			 * interrupt. Let the task sleep.
			 * An extra task reference is needed since a different
			 * cpu may set the task state to TASK_RUNNING again
			 * before the scheduler is reached.
			 */
			get_task_struct(tsk);
			tsk->thread.pfault_wait = 1;
			list_add(&tsk->thread.list, &pfault_list);
block:
			/*
			 * Since this must be a userspace fault, there
			 * is no kernel task state to trample. Rely on the
			 * return to userspace schedule() to block.
			 */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			set_tsk_need_resched(tsk);
			set_preempt_need_resched();
		}
	}
out:
	spin_unlock(&pfault_lock);
	put_task_struct(tsk);
}

/*
 * A cpu that went down had its pending pfaults cancelled (see
 * __pfault_fini()), so the matching completion interrupts may never
 * arrive. Wake up all tasks that are still waiting: a woken task simply
 * faults again if its page has not been paged in yet.
 */
static int pfault_cpu_dead(unsigned int cpu)
{
	struct thread_struct *thread, *next;
	struct task_struct *tsk;

	spin_lock_irq(&pfault_lock);
	list_for_each_entry_safe(thread, next, &pfault_list, list) {
		thread->pfault_wait = 0;
		list_del(&thread->list);
		tsk = container_of(thread, struct task_struct, thread);
		wake_up_process(tsk);
		put_task_struct(tsk);
	}
	spin_unlock_irq(&pfault_lock);
	return 0;
}

static int __init pfault_irq_init(void)
{
	int rc;

	rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
	if (rc)
		goto out_extint;
	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
	if (rc)
		goto out_pfault;
	irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
	cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
				  NULL, pfault_cpu_dead);
	return 0;

out_pfault:
	unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
	pfault_disable = 1;
	return rc;
}
early_initcall(pfault_irq_init);