1 /*
2  * intel_pstate.c: Native P state management for Intel processors
3  *
4  * (C) Copyright 2012 Intel Corporation
5  * Author: Dirk Brandewie <dirk.j.brandewie@intel.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; version 2
10  * of the License.
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/kernel.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/module.h>
18 #include <linux/ktime.h>
19 #include <linux/hrtimer.h>
20 #include <linux/tick.h>
21 #include <linux/slab.h>
22 #include <linux/sched/cpufreq.h>
23 #include <linux/list.h>
24 #include <linux/cpu.h>
25 #include <linux/cpufreq.h>
26 #include <linux/sysfs.h>
27 #include <linux/types.h>
28 #include <linux/fs.h>
29 #include <linux/acpi.h>
30 #include <linux/vmalloc.h>
31 #include <trace/events/power.h>
32 
33 #include <asm/div64.h>
34 #include <asm/msr.h>
35 #include <asm/cpu_device_id.h>
36 #include <asm/cpufeature.h>
37 #include <asm/intel-family.h>
38 
39 #define INTEL_PSTATE_SAMPLING_INTERVAL	(10 * NSEC_PER_MSEC)
40 
41 #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
42 #define INTEL_CPUFREQ_TRANSITION_DELAY		500
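/*
 * Note (assuming these values feed cpuinfo.transition_latency and
 * transition_delay_us, which cpufreq takes in nanoseconds and microseconds
 * respectively): 20000 corresponds to a 20 us transition latency and 500 to
 * a 500 us transition delay.
 */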
43 
44 #ifdef CONFIG_ACPI
45 #include <acpi/processor.h>
46 #include <acpi/cppc_acpi.h>
47 #endif
48 
49 #define FRAC_BITS 8
50 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
51 #define fp_toint(X) ((X) >> FRAC_BITS)
52 
53 #define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3))
54 
55 #define EXT_BITS 6
56 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
57 #define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
58 #define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
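/*
 * Illustrative example of the fixed-point format above (not part of the
 * driver logic): with FRAC_BITS == 8, int_tofp(3) == 0x300,
 * fp_toint(0x380) == 3 and ONE_EIGHTH_FP == 0x20, which represents 0.125.
 */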
59 
60 static inline int32_t mul_fp(int32_t x, int32_t y)
61 {
62 	return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
63 }
64 
65 static inline int32_t div_fp(s64 x, s64 y)
66 {
67 	return div64_s64((int64_t)x << FRAC_BITS, y);
68 }
69 
70 static inline int ceiling_fp(int32_t x)
71 {
72 	int mask, ret;
73 
74 	ret = fp_toint(x);
75 	mask = (1 << FRAC_BITS) - 1;
76 	if (x & mask)
77 		ret += 1;
78 	return ret;
79 }
80 
81 static inline int32_t percent_fp(int percent)
82 {
83 	return div_fp(percent, 100);
84 }
85 
86 static inline u64 mul_ext_fp(u64 x, u64 y)
87 {
88 	return (x * y) >> EXT_FRAC_BITS;
89 }
90 
91 static inline u64 div_ext_fp(u64 x, u64 y)
92 {
93 	return div64_u64(x << EXT_FRAC_BITS, y);
94 }
95 
96 static inline int32_t percent_ext_fp(int percent)
97 {
98 	return div_ext_fp(percent, 100);
99 }
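/*
 * Illustrative example of the extended fixed-point helpers above (hypothetical
 * values): with EXT_FRAC_BITS == 14, div_ext_fp(3, 2) == 0x6000, which
 * represents 1.5, and mul_ext_fp(0x6000, 100) == 150.  This is how the
 * APERF/MPERF ratio is later scaled by cpu_khz in get_avg_frequency().
 */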
100 
101 /**
102  * struct sample -	Store performance sample
 * @core_avg_perf:	Ratio of APERF/MPERF which is the actual average
 *			performance during the last sample period
 * @busy_scaled:	Scaled busy value which is used to calculate the next
 *			P state. This can differ from core_avg_perf
 *			to account for CPU idle periods
108  * @aperf:		Difference of actual performance frequency clock count
109  *			read from APERF MSR between last and current sample
110  * @mperf:		Difference of maximum performance frequency clock count
111  *			read from MPERF MSR between last and current sample
112  * @tsc:		Difference of time stamp counter between last and
113  *			current sample
114  * @time:		Current time from scheduler
115  *
116  * This structure is used in the cpudata structure to store performance sample
117  * data for choosing next P State.
118  */
119 struct sample {
120 	int32_t core_avg_perf;
121 	int32_t busy_scaled;
122 	u64 aperf;
123 	u64 mperf;
124 	u64 tsc;
125 	u64 time;
126 };
127 
128 /**
129  * struct pstate_data - Store P state data
130  * @current_pstate:	Current requested P state
131  * @min_pstate:		Min P state possible for this platform
132  * @max_pstate:		Max P state possible for this platform
 * @max_pstate_physical: Physical Max P state for a processor.
 *			This can be higher than max_pstate, which can be
 *			limited by platform thermal design power limits
 * @scaling:		Scaling factor to convert a P state to frequency in
 *			cpufreq units (kHz)
138  * @turbo_pstate:	Max Turbo P state possible for this platform
139  * @max_freq:		@max_pstate frequency in cpufreq units
140  * @turbo_freq:		@turbo_pstate frequency in cpufreq units
141  *
142  * Stores the per cpu model P state limits and current P state.
143  */
144 struct pstate_data {
145 	int	current_pstate;
146 	int	min_pstate;
147 	int	max_pstate;
148 	int	max_pstate_physical;
149 	int	scaling;
150 	int	turbo_pstate;
151 	unsigned int max_freq;
152 	unsigned int turbo_freq;
153 };
154 
155 /**
156  * struct vid_data -	Stores voltage information data
157  * @min:		VID data for this platform corresponding to
158  *			the lowest P state
159  * @max:		VID data corresponding to the highest P State.
160  * @turbo:		VID data for turbo P state
161  * @ratio:		Ratio of (vid max - vid min) /
162  *			(max P state - Min P State)
163  *
 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling).
 * This data is used on Atom platforms, where in addition to the target P state,
 * the voltage data needs to be specified to select the next P state.
167  */
168 struct vid_data {
169 	int min;
170 	int max;
171 	int turbo;
172 	int32_t ratio;
173 };
174 
175 /**
176  * struct global_params - Global parameters, mostly tunable via sysfs.
177  * @no_turbo:		Whether or not to use turbo P-states.
 * @turbo_disabled:	Whether or not turbo P-states are available at all,
179  *			based on the MSR_IA32_MISC_ENABLE value and whether or
180  *			not the maximum reported turbo P-state is different from
181  *			the maximum reported non-turbo one.
182  * @turbo_disabled_mf:	The @turbo_disabled value reflected by cpuinfo.max_freq.
183  * @min_perf_pct:	Minimum capacity limit in percent of the maximum turbo
184  *			P-state capacity.
185  * @max_perf_pct:	Maximum capacity limit in percent of the maximum turbo
186  *			P-state capacity.
187  */
188 struct global_params {
189 	bool no_turbo;
190 	bool turbo_disabled;
191 	bool turbo_disabled_mf;
192 	int max_perf_pct;
193 	int min_perf_pct;
194 };
195 
196 /**
197  * struct cpudata -	Per CPU instance data storage
198  * @cpu:		CPU number for this instance data
199  * @policy:		CPUFreq policy value
200  * @update_util:	CPUFreq utility callback information
201  * @update_util_set:	CPUFreq utility callback is set
202  * @iowait_boost:	iowait-related boost fraction
203  * @last_update:	Time of the last update.
204  * @pstate:		Stores P state limits for this CPU
205  * @vid:		Stores VID limits for this CPU
 * @last_sample_time:	Last sample time
 * @aperf_mperf_shift:	APERF and MPERF increment once every 2^shift clock
 *			cycles; the MPERF delta is shifted left by this
 *			amount when calculating CPU busy.
210  * @prev_aperf:		Last APERF value read from APERF MSR
211  * @prev_mperf:		Last MPERF value read from MPERF MSR
212  * @prev_tsc:		Last timestamp counter (TSC) value
 * @prev_cummulative_iowait: IO wait time difference between the last and
 *			current sample
 * @sample:		Storage for the last sample data
216  * @min_perf_ratio:	Minimum capacity in terms of PERF or HWP ratios
217  * @max_perf_ratio:	Maximum capacity in terms of PERF or HWP ratios
218  * @acpi_perf_data:	Stores ACPI perf information read from _PSS
219  * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
220  * @epp_powersave:	Last saved HWP energy performance preference
221  *			(EPP) or energy performance bias (EPB),
222  *			when policy switched to performance
223  * @epp_policy:		Last saved policy used to set EPP/EPB
224  * @epp_default:	Power on default HWP energy performance
225  *			preference/bias
226  * @epp_saved:		Saved EPP/EPB during system suspend or CPU offline
227  *			operation
228  * @hwp_req_cached:	Cached value of the last HWP Request MSR
229  * @hwp_cap_cached:	Cached value of the last HWP Capabilities MSR
230  * @last_io_update:	Last time when IO wake flag was set
231  * @sched_flags:	Store scheduler flags for possible cross CPU update
232  * @hwp_boost_min:	Last HWP boosted min performance
233  *
234  * This structure stores per CPU instance data for all CPUs.
235  */
236 struct cpudata {
237 	int cpu;
238 
239 	unsigned int policy;
240 	struct update_util_data update_util;
241 	bool   update_util_set;
242 
243 	struct pstate_data pstate;
244 	struct vid_data vid;
245 
246 	u64	last_update;
247 	u64	last_sample_time;
248 	u64	aperf_mperf_shift;
249 	u64	prev_aperf;
250 	u64	prev_mperf;
251 	u64	prev_tsc;
252 	u64	prev_cummulative_iowait;
253 	struct sample sample;
254 	int32_t	min_perf_ratio;
255 	int32_t	max_perf_ratio;
256 #ifdef CONFIG_ACPI
257 	struct acpi_processor_performance acpi_perf_data;
258 	bool valid_pss_table;
259 #endif
260 	unsigned int iowait_boost;
261 	s16 epp_powersave;
262 	s16 epp_policy;
263 	s16 epp_default;
264 	s16 epp_saved;
265 	u64 hwp_req_cached;
266 	u64 hwp_cap_cached;
267 	u64 last_io_update;
268 	unsigned int sched_flags;
269 	u32 hwp_boost_min;
270 };
271 
272 static struct cpudata **all_cpu_data;
273 
274 /**
275  * struct pstate_funcs - Per CPU model specific callbacks
276  * @get_max:		Callback to get maximum non turbo effective P state
277  * @get_max_physical:	Callback to get maximum non turbo physical P state
278  * @get_min:		Callback to get minimum P state
279  * @get_turbo:		Callback to get turbo P state
280  * @get_scaling:	Callback to get frequency scaling factor
 * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF counting shift
 * @get_val:		Callback to convert P state to actual MSR write value
 * @get_vid:		Callback to get VID data for Atom platforms
 *
 * Core and Atom CPU models have different ways to get P state limits. This
 * structure is used to store those callbacks.
286  */
287 struct pstate_funcs {
288 	int (*get_max)(void);
289 	int (*get_max_physical)(void);
290 	int (*get_min)(void);
291 	int (*get_turbo)(void);
292 	int (*get_scaling)(void);
293 	int (*get_aperf_mperf_shift)(void);
294 	u64 (*get_val)(struct cpudata*, int pstate);
295 	void (*get_vid)(struct cpudata *);
296 };
297 
298 static struct pstate_funcs pstate_funcs __read_mostly;
299 
300 static int hwp_active __read_mostly;
301 static int hwp_mode_bdw __read_mostly;
302 static bool per_cpu_limits __read_mostly;
303 static bool hwp_boost __read_mostly;
304 
305 static struct cpufreq_driver *intel_pstate_driver __read_mostly;
306 
307 #ifdef CONFIG_ACPI
308 static bool acpi_ppc;
309 #endif
310 
311 static struct global_params global;
312 
313 static DEFINE_MUTEX(intel_pstate_driver_lock);
314 static DEFINE_MUTEX(intel_pstate_limits_lock);
315 
316 #ifdef CONFIG_ACPI
317 
318 static bool intel_pstate_acpi_pm_profile_server(void)
319 {
320 	if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
321 	    acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
322 		return true;
323 
324 	return false;
325 }
326 
327 static bool intel_pstate_get_ppc_enable_status(void)
328 {
329 	if (intel_pstate_acpi_pm_profile_server())
330 		return true;
331 
332 	return acpi_ppc;
333 }
334 
335 #ifdef CONFIG_ACPI_CPPC_LIB
336 
337 /* The work item is needed to avoid CPU hotplug locking issues */
338 static void intel_pstste_sched_itmt_work_fn(struct work_struct *work)
339 {
340 	sched_set_itmt_support();
341 }
342 
343 static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn);
344 
345 static void intel_pstate_set_itmt_prio(int cpu)
346 {
347 	struct cppc_perf_caps cppc_perf;
348 	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
349 	int ret;
350 
351 	ret = cppc_get_perf_caps(cpu, &cppc_perf);
352 	if (ret)
353 		return;
354 
355 	/*
356 	 * The priorities can be set regardless of whether or not
	 * sched_set_itmt_support() has been called and it is valid to
358 	 * update them at any time after it has been called.
359 	 */
360 	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
361 
362 	if (max_highest_perf <= min_highest_perf) {
363 		if (cppc_perf.highest_perf > max_highest_perf)
364 			max_highest_perf = cppc_perf.highest_perf;
365 
366 		if (cppc_perf.highest_perf < min_highest_perf)
367 			min_highest_perf = cppc_perf.highest_perf;
368 
369 		if (max_highest_perf > min_highest_perf) {
370 			/*
371 			 * This code can be run during CPU online under the
372 			 * CPU hotplug locks, so sched_set_itmt_support()
373 			 * cannot be called from here.  Queue up a work item
374 			 * to invoke it.
375 			 */
376 			schedule_work(&sched_itmt_work);
377 		}
378 	}
379 }
380 
381 static int intel_pstate_get_cppc_guranteed(int cpu)
382 {
383 	struct cppc_perf_caps cppc_perf;
384 	int ret;
385 
386 	ret = cppc_get_perf_caps(cpu, &cppc_perf);
387 	if (ret)
388 		return ret;
389 
390 	if (cppc_perf.guaranteed_perf)
391 		return cppc_perf.guaranteed_perf;
392 
393 	return cppc_perf.nominal_perf;
394 }
395 
396 #else /* CONFIG_ACPI_CPPC_LIB */
397 static void intel_pstate_set_itmt_prio(int cpu)
398 {
399 }
400 #endif /* CONFIG_ACPI_CPPC_LIB */
401 
402 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
403 {
404 	struct cpudata *cpu;
405 	int ret;
406 	int i;
407 
408 	if (hwp_active) {
409 		intel_pstate_set_itmt_prio(policy->cpu);
410 		return;
411 	}
412 
413 	if (!intel_pstate_get_ppc_enable_status())
414 		return;
415 
416 	cpu = all_cpu_data[policy->cpu];
417 
418 	ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
419 						  policy->cpu);
420 	if (ret)
421 		return;
422 
423 	/*
424 	 * Check if the control value in _PSS is for PERF_CTL MSR, which should
425 	 * guarantee that the states returned by it map to the states in our
426 	 * list directly.
427 	 */
428 	if (cpu->acpi_perf_data.control_register.space_id !=
429 						ACPI_ADR_SPACE_FIXED_HARDWARE)
430 		goto err;
431 
432 	/*
	 * If there is only one entry in _PSS, simply ignore _PSS and continue
	 * as usual without taking _PSS into account.
435 	 */
436 	if (cpu->acpi_perf_data.state_count < 2)
437 		goto err;
438 
439 	pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
440 	for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
441 		pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
442 			 (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
443 			 (u32) cpu->acpi_perf_data.states[i].core_frequency,
444 			 (u32) cpu->acpi_perf_data.states[i].power,
445 			 (u32) cpu->acpi_perf_data.states[i].control);
446 	}
447 
448 	/*
	 * The _PSS table doesn't contain the whole turbo frequency range.
	 * It just contains +1 MHz above the max non-turbo frequency,
	 * with a control value corresponding to the max turbo ratio. But
	 * when cpufreq set_policy is called, it will be called with this
	 * max frequency, which will cause reduced performance, as this
	 * driver uses the real max turbo frequency as the max frequency.
	 * So correct the frequency in the _PSS table to the real max
	 * turbo frequency based on the turbo state.
	 * Also convert to MHz, as the _PSS frequencies are in MHz.
458 	 */
459 	if (!global.turbo_disabled)
460 		cpu->acpi_perf_data.states[0].core_frequency =
461 					policy->cpuinfo.max_freq / 1000;
462 	cpu->valid_pss_table = true;
463 	pr_debug("_PPC limits will be enforced\n");
464 
465 	return;
466 
467  err:
468 	cpu->valid_pss_table = false;
469 	acpi_processor_unregister_performance(policy->cpu);
470 }
471 
472 static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
473 {
474 	struct cpudata *cpu;
475 
476 	cpu = all_cpu_data[policy->cpu];
477 	if (!cpu->valid_pss_table)
478 		return;
479 
480 	acpi_processor_unregister_performance(policy->cpu);
481 }
482 #else /* CONFIG_ACPI */
483 static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
484 {
485 }
486 
487 static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
488 {
489 }
490 
491 static inline bool intel_pstate_acpi_pm_profile_server(void)
492 {
493 	return false;
494 }
495 #endif /* CONFIG_ACPI */
496 
497 #ifndef CONFIG_ACPI_CPPC_LIB
498 static int intel_pstate_get_cppc_guranteed(int cpu)
499 {
500 	return -ENOTSUPP;
501 }
502 #endif /* CONFIG_ACPI_CPPC_LIB */
503 
504 static inline void update_turbo_state(void)
505 {
506 	u64 misc_en;
507 	struct cpudata *cpu;
508 
509 	cpu = all_cpu_data[0];
510 	rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
511 	global.turbo_disabled =
512 		(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
513 		 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
514 }
515 
516 static int min_perf_pct_min(void)
517 {
518 	struct cpudata *cpu = all_cpu_data[0];
519 	int turbo_pstate = cpu->pstate.turbo_pstate;
520 
521 	return turbo_pstate ?
522 		(cpu->pstate.min_pstate * 100 / turbo_pstate) : 0;
523 }
524 
525 static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
526 {
527 	u64 epb;
528 	int ret;
529 
530 	if (!boot_cpu_has(X86_FEATURE_EPB))
531 		return -ENXIO;
532 
533 	ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
534 	if (ret)
535 		return (s16)ret;
536 
537 	return (s16)(epb & 0x0f);
538 }
539 
540 static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
541 {
542 	s16 epp;
543 
544 	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
545 		/*
		 * When hwp_req_data is 0, it means the caller didn't read
		 * MSR_HWP_REQUEST, so read it here to get the EPP.
548 		 */
549 		if (!hwp_req_data) {
550 			epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
551 					    &hwp_req_data);
552 			if (epp)
553 				return epp;
554 		}
555 		epp = (hwp_req_data >> 24) & 0xff;
556 	} else {
557 		/* When there is no EPP present, HWP uses EPB settings */
558 		epp = intel_pstate_get_epb(cpu_data);
559 	}
560 
561 	return epp;
562 }
563 
564 static int intel_pstate_set_epb(int cpu, s16 pref)
565 {
566 	u64 epb;
567 	int ret;
568 
569 	if (!boot_cpu_has(X86_FEATURE_EPB))
570 		return -ENXIO;
571 
572 	ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
573 	if (ret)
574 		return ret;
575 
576 	epb = (epb & ~0x0f) | pref;
577 	wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
578 
579 	return 0;
580 }
581 
582 /*
583  * EPP/EPB display strings corresponding to EPP index in the
584  * energy_perf_strings[]
585  *	index		String
586  *-------------------------------------
587  *	0		default
588  *	1		performance
589  *	2		balance_performance
590  *	3		balance_power
591  *	4		power
592  */
593 static const char * const energy_perf_strings[] = {
594 	"default",
595 	"performance",
596 	"balance_performance",
597 	"balance_power",
598 	"power",
599 	NULL
600 };
601 static const unsigned int epp_values[] = {
602 	HWP_EPP_PERFORMANCE,
603 	HWP_EPP_BALANCE_PERFORMANCE,
604 	HWP_EPP_BALANCE_POWERSAVE,
605 	HWP_EPP_POWERSAVE
606 };
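/*
 * Note that epp_values[] is offset by one relative to energy_perf_strings[]:
 * index 0 ("default") has no fixed EPP value and uses the power-on default
 * instead (see intel_pstate_set_energy_pref_index() below).
 */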
607 
608 static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data)
609 {
610 	s16 epp;
611 	int index = -EINVAL;
612 
613 	epp = intel_pstate_get_epp(cpu_data, 0);
614 	if (epp < 0)
615 		return epp;
616 
617 	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
618 		if (epp == HWP_EPP_PERFORMANCE)
619 			return 1;
620 		if (epp <= HWP_EPP_BALANCE_PERFORMANCE)
621 			return 2;
622 		if (epp <= HWP_EPP_BALANCE_POWERSAVE)
623 			return 3;
624 		else
625 			return 4;
626 	} else if (boot_cpu_has(X86_FEATURE_EPB)) {
627 		/*
628 		 * Range:
629 		 *	0x00-0x03	:	Performance
630 		 *	0x04-0x07	:	Balance performance
631 		 *	0x08-0x0B	:	Balance power
632 		 *	0x0C-0x0F	:	Power
		 * The EPB is a 4-bit value, but our ranges restrict the
		 * values that can be set. Here we effectively use only the
		 * top two bits.
636 		 */
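		/*
		 * Worked example (illustrative): an EPB of 0x07 maps to
		 * (0x07 >> 2) + 1 == 2, i.e. "balance_performance".
		 */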
637 		index = (epp >> 2) + 1;
638 	}
639 
640 	return index;
641 }
642 
643 static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
644 					      int pref_index)
645 {
646 	int epp = -EINVAL;
647 	int ret;
648 
649 	if (!pref_index)
650 		epp = cpu_data->epp_default;
651 
652 	mutex_lock(&intel_pstate_limits_lock);
653 
654 	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
655 		u64 value;
656 
657 		ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value);
658 		if (ret)
659 			goto return_pref;
660 
661 		value &= ~GENMASK_ULL(31, 24);
662 
663 		if (epp == -EINVAL)
664 			epp = epp_values[pref_index - 1];
665 
666 		value |= (u64)epp << 24;
667 		ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value);
668 	} else {
669 		if (epp == -EINVAL)
670 			epp = (pref_index - 1) << 2;
671 		ret = intel_pstate_set_epb(cpu_data->cpu, epp);
672 	}
673 return_pref:
674 	mutex_unlock(&intel_pstate_limits_lock);
675 
676 	return ret;
677 }
678 
679 static ssize_t show_energy_performance_available_preferences(
680 				struct cpufreq_policy *policy, char *buf)
681 {
682 	int i = 0;
683 	int ret = 0;
684 
685 	while (energy_perf_strings[i] != NULL)
686 		ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
687 
688 	ret += sprintf(&buf[ret], "\n");
689 
690 	return ret;
691 }
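/*
 * Reading this attribute is expected to list the preference names from
 * energy_perf_strings[] separated by spaces, e.g.
 * "default performance balance_performance balance_power power"
 * (illustrative).
 */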
692 
693 cpufreq_freq_attr_ro(energy_performance_available_preferences);
694 
695 static ssize_t store_energy_performance_preference(
696 		struct cpufreq_policy *policy, const char *buf, size_t count)
697 {
698 	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
699 	char str_preference[21];
700 	int ret;
701 
702 	ret = sscanf(buf, "%20s", str_preference);
703 	if (ret != 1)
704 		return -EINVAL;
705 
706 	ret = match_string(energy_perf_strings, -1, str_preference);
707 	if (ret < 0)
708 		return ret;
709 
710 	intel_pstate_set_energy_pref_index(cpu_data, ret);
711 	return count;
712 }
713 
714 static ssize_t show_energy_performance_preference(
715 				struct cpufreq_policy *policy, char *buf)
716 {
717 	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
718 	int preference;
719 
720 	preference = intel_pstate_get_energy_pref_index(cpu_data);
721 	if (preference < 0)
722 		return preference;
723 
724 	return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
725 }
726 
727 cpufreq_freq_attr_rw(energy_performance_preference);
728 
729 static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf)
730 {
731 	struct cpudata *cpu;
732 	u64 cap;
733 	int ratio;
734 
735 	ratio = intel_pstate_get_cppc_guranteed(policy->cpu);
736 	if (ratio <= 0) {
737 		rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap);
738 		ratio = HWP_GUARANTEED_PERF(cap);
739 	}
740 
741 	cpu = all_cpu_data[policy->cpu];
742 
743 	return sprintf(buf, "%d\n", ratio * cpu->pstate.scaling);
744 }
745 
746 cpufreq_freq_attr_ro(base_frequency);
747 
748 static struct freq_attr *hwp_cpufreq_attrs[] = {
749 	&energy_performance_preference,
750 	&energy_performance_available_preferences,
751 	&base_frequency,
752 	NULL,
753 };
754 
755 static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max,
756 				     int *current_max)
757 {
758 	u64 cap;
759 
760 	rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
761 	WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap);
762 	if (global.no_turbo)
763 		*current_max = HWP_GUARANTEED_PERF(cap);
764 	else
765 		*current_max = HWP_HIGHEST_PERF(cap);
766 
767 	*phy_max = HWP_HIGHEST_PERF(cap);
768 }
769 
770 static void intel_pstate_hwp_set(unsigned int cpu)
771 {
772 	struct cpudata *cpu_data = all_cpu_data[cpu];
773 	int max, min;
774 	u64 value;
775 	s16 epp;
776 
777 	max = cpu_data->max_perf_ratio;
778 	min = cpu_data->min_perf_ratio;
779 
780 	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
781 		min = max;
782 
783 	rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
784 
785 	value &= ~HWP_MIN_PERF(~0L);
786 	value |= HWP_MIN_PERF(min);
787 
788 	value &= ~HWP_MAX_PERF(~0L);
789 	value |= HWP_MAX_PERF(max);
790 
791 	if (cpu_data->epp_policy == cpu_data->policy)
792 		goto skip_epp;
793 
794 	cpu_data->epp_policy = cpu_data->policy;
795 
796 	if (cpu_data->epp_saved >= 0) {
797 		epp = cpu_data->epp_saved;
798 		cpu_data->epp_saved = -EINVAL;
799 		goto update_epp;
800 	}
801 
802 	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
803 		epp = intel_pstate_get_epp(cpu_data, value);
804 		cpu_data->epp_powersave = epp;
		/* If the EPP read failed, don't try to write */
806 		if (epp < 0)
807 			goto skip_epp;
808 
809 		epp = 0;
810 	} else {
		/* Skip setting EPP when the saved value is invalid */
812 		if (cpu_data->epp_powersave < 0)
813 			goto skip_epp;
814 
		/*
		 * No need to restore EPP when it is not zero. This
		 * means one of the following:
		 *  - the policy has not changed
		 *  - the user has changed it manually
		 *  - there was an error reading the EPB
		 */
822 		epp = intel_pstate_get_epp(cpu_data, value);
823 		if (epp)
824 			goto skip_epp;
825 
826 		epp = cpu_data->epp_powersave;
827 	}
828 update_epp:
829 	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
830 		value &= ~GENMASK_ULL(31, 24);
831 		value |= (u64)epp << 24;
832 	} else {
833 		intel_pstate_set_epb(cpu, epp);
834 	}
835 skip_epp:
836 	WRITE_ONCE(cpu_data->hwp_req_cached, value);
837 	wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
838 }
839 
840 static void intel_pstate_hwp_force_min_perf(int cpu)
841 {
842 	u64 value;
843 	int min_perf;
844 
845 	value = all_cpu_data[cpu]->hwp_req_cached;
846 	value &= ~GENMASK_ULL(31, 0);
847 	min_perf = HWP_LOWEST_PERF(all_cpu_data[cpu]->hwp_cap_cached);
848 
849 	/* Set hwp_max = hwp_min */
850 	value |= HWP_MAX_PERF(min_perf);
851 	value |= HWP_MIN_PERF(min_perf);
852 
853 	/* Set EPP/EPB to min */
854 	if (boot_cpu_has(X86_FEATURE_HWP_EPP))
855 		value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
856 	else
857 		intel_pstate_set_epb(cpu, HWP_EPP_BALANCE_POWERSAVE);
858 
859 	wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
860 }
861 
862 static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
863 {
864 	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
865 
866 	if (!hwp_active)
867 		return 0;
868 
869 	cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0);
870 
871 	return 0;
872 }
873 
874 static void intel_pstate_hwp_enable(struct cpudata *cpudata);
875 
876 static int intel_pstate_resume(struct cpufreq_policy *policy)
877 {
878 	if (!hwp_active)
879 		return 0;
880 
881 	mutex_lock(&intel_pstate_limits_lock);
882 
883 	if (policy->cpu == 0)
884 		intel_pstate_hwp_enable(all_cpu_data[policy->cpu]);
885 
886 	all_cpu_data[policy->cpu]->epp_policy = 0;
887 	intel_pstate_hwp_set(policy->cpu);
888 
889 	mutex_unlock(&intel_pstate_limits_lock);
890 
891 	return 0;
892 }
893 
894 static void intel_pstate_update_policies(void)
895 {
896 	int cpu;
897 
898 	for_each_possible_cpu(cpu)
899 		cpufreq_update_policy(cpu);
900 }
901 
902 static void intel_pstate_update_max_freq(unsigned int cpu)
903 {
904 	struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
905 	struct cpufreq_policy new_policy;
906 	struct cpudata *cpudata;
907 
908 	if (!policy)
909 		return;
910 
911 	cpudata = all_cpu_data[cpu];
912 	policy->cpuinfo.max_freq = global.turbo_disabled_mf ?
913 			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
914 
915 	memcpy(&new_policy, policy, sizeof(*policy));
916 	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
917 	new_policy.min = min(policy->user_policy.min, new_policy.max);
918 
919 	cpufreq_set_policy(policy, &new_policy);
920 
921 	cpufreq_cpu_release(policy);
922 }
923 
924 static void intel_pstate_update_limits(unsigned int cpu)
925 {
926 	mutex_lock(&intel_pstate_driver_lock);
927 
928 	update_turbo_state();
929 	/*
930 	 * If turbo has been turned on or off globally, policy limits for
931 	 * all CPUs need to be updated to reflect that.
932 	 */
933 	if (global.turbo_disabled_mf != global.turbo_disabled) {
934 		global.turbo_disabled_mf = global.turbo_disabled;
935 		for_each_possible_cpu(cpu)
936 			intel_pstate_update_max_freq(cpu);
937 	} else {
938 		cpufreq_update_policy(cpu);
939 	}
940 
941 	mutex_unlock(&intel_pstate_driver_lock);
942 }
943 
944 /************************** sysfs begin ************************/
945 #define show_one(file_name, object)					\
946 	static ssize_t show_##file_name					\
947 	(struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
948 	{								\
949 		return sprintf(buf, "%u\n", global.object);		\
950 	}
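/*
 * For example (illustrative), show_one(max_perf_pct, max_perf_pct) below
 * expands to a show_max_perf_pct() callback that prints global.max_perf_pct.
 */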
951 
952 static ssize_t intel_pstate_show_status(char *buf);
953 static int intel_pstate_update_status(const char *buf, size_t size);
954 
955 static ssize_t show_status(struct kobject *kobj,
956 			   struct kobj_attribute *attr, char *buf)
957 {
958 	ssize_t ret;
959 
960 	mutex_lock(&intel_pstate_driver_lock);
961 	ret = intel_pstate_show_status(buf);
962 	mutex_unlock(&intel_pstate_driver_lock);
963 
964 	return ret;
965 }
966 
967 static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
968 			    const char *buf, size_t count)
969 {
970 	char *p = memchr(buf, '\n', count);
971 	int ret;
972 
973 	mutex_lock(&intel_pstate_driver_lock);
974 	ret = intel_pstate_update_status(buf, p ? p - buf : count);
975 	mutex_unlock(&intel_pstate_driver_lock);
976 
977 	return ret < 0 ? ret : count;
978 }
979 
980 static ssize_t show_turbo_pct(struct kobject *kobj,
981 				struct kobj_attribute *attr, char *buf)
982 {
983 	struct cpudata *cpu;
984 	int total, no_turbo, turbo_pct;
985 	uint32_t turbo_fp;
986 
987 	mutex_lock(&intel_pstate_driver_lock);
988 
989 	if (!intel_pstate_driver) {
990 		mutex_unlock(&intel_pstate_driver_lock);
991 		return -EAGAIN;
992 	}
993 
994 	cpu = all_cpu_data[0];
995 
996 	total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
997 	no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
998 	turbo_fp = div_fp(no_turbo, total);
999 	turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
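	/*
	 * Worked example (hypothetical values): with min_pstate == 9,
	 * max_pstate == 32 and turbo_pstate == 40, total == 32 and
	 * no_turbo == 24, so turbo_pct == 100 - 75 == 25.
	 */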
1000 
1001 	mutex_unlock(&intel_pstate_driver_lock);
1002 
1003 	return sprintf(buf, "%u\n", turbo_pct);
1004 }
1005 
1006 static ssize_t show_num_pstates(struct kobject *kobj,
1007 				struct kobj_attribute *attr, char *buf)
1008 {
1009 	struct cpudata *cpu;
1010 	int total;
1011 
1012 	mutex_lock(&intel_pstate_driver_lock);
1013 
1014 	if (!intel_pstate_driver) {
1015 		mutex_unlock(&intel_pstate_driver_lock);
1016 		return -EAGAIN;
1017 	}
1018 
1019 	cpu = all_cpu_data[0];
1020 	total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
1021 
1022 	mutex_unlock(&intel_pstate_driver_lock);
1023 
1024 	return sprintf(buf, "%u\n", total);
1025 }
1026 
1027 static ssize_t show_no_turbo(struct kobject *kobj,
1028 			     struct kobj_attribute *attr, char *buf)
1029 {
1030 	ssize_t ret;
1031 
1032 	mutex_lock(&intel_pstate_driver_lock);
1033 
1034 	if (!intel_pstate_driver) {
1035 		mutex_unlock(&intel_pstate_driver_lock);
1036 		return -EAGAIN;
1037 	}
1038 
1039 	update_turbo_state();
1040 	if (global.turbo_disabled)
1041 		ret = sprintf(buf, "%u\n", global.turbo_disabled);
1042 	else
1043 		ret = sprintf(buf, "%u\n", global.no_turbo);
1044 
1045 	mutex_unlock(&intel_pstate_driver_lock);
1046 
1047 	return ret;
1048 }
1049 
1050 static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b,
1051 			      const char *buf, size_t count)
1052 {
1053 	unsigned int input;
1054 	int ret;
1055 
1056 	ret = sscanf(buf, "%u", &input);
1057 	if (ret != 1)
1058 		return -EINVAL;
1059 
1060 	mutex_lock(&intel_pstate_driver_lock);
1061 
1062 	if (!intel_pstate_driver) {
1063 		mutex_unlock(&intel_pstate_driver_lock);
1064 		return -EAGAIN;
1065 	}
1066 
1067 	mutex_lock(&intel_pstate_limits_lock);
1068 
1069 	update_turbo_state();
1070 	if (global.turbo_disabled) {
1071 		pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
1072 		mutex_unlock(&intel_pstate_limits_lock);
1073 		mutex_unlock(&intel_pstate_driver_lock);
1074 		return -EPERM;
1075 	}
1076 
1077 	global.no_turbo = clamp_t(int, input, 0, 1);
1078 
1079 	if (global.no_turbo) {
1080 		struct cpudata *cpu = all_cpu_data[0];
1081 		int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate;
1082 
1083 		/* Squash the global minimum into the permitted range. */
1084 		if (global.min_perf_pct > pct)
1085 			global.min_perf_pct = pct;
1086 	}
1087 
1088 	mutex_unlock(&intel_pstate_limits_lock);
1089 
1090 	intel_pstate_update_policies();
1091 
1092 	mutex_unlock(&intel_pstate_driver_lock);
1093 
1094 	return count;
1095 }
1096 
1097 static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b,
1098 				  const char *buf, size_t count)
1099 {
1100 	unsigned int input;
1101 	int ret;
1102 
1103 	ret = sscanf(buf, "%u", &input);
1104 	if (ret != 1)
1105 		return -EINVAL;
1106 
1107 	mutex_lock(&intel_pstate_driver_lock);
1108 
1109 	if (!intel_pstate_driver) {
1110 		mutex_unlock(&intel_pstate_driver_lock);
1111 		return -EAGAIN;
1112 	}
1113 
1114 	mutex_lock(&intel_pstate_limits_lock);
1115 
1116 	global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100);
1117 
1118 	mutex_unlock(&intel_pstate_limits_lock);
1119 
1120 	intel_pstate_update_policies();
1121 
1122 	mutex_unlock(&intel_pstate_driver_lock);
1123 
1124 	return count;
1125 }
1126 
1127 static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b,
1128 				  const char *buf, size_t count)
1129 {
1130 	unsigned int input;
1131 	int ret;
1132 
1133 	ret = sscanf(buf, "%u", &input);
1134 	if (ret != 1)
1135 		return -EINVAL;
1136 
1137 	mutex_lock(&intel_pstate_driver_lock);
1138 
1139 	if (!intel_pstate_driver) {
1140 		mutex_unlock(&intel_pstate_driver_lock);
1141 		return -EAGAIN;
1142 	}
1143 
1144 	mutex_lock(&intel_pstate_limits_lock);
1145 
1146 	global.min_perf_pct = clamp_t(int, input,
1147 				      min_perf_pct_min(), global.max_perf_pct);
1148 
1149 	mutex_unlock(&intel_pstate_limits_lock);
1150 
1151 	intel_pstate_update_policies();
1152 
1153 	mutex_unlock(&intel_pstate_driver_lock);
1154 
1155 	return count;
1156 }
1157 
1158 static ssize_t show_hwp_dynamic_boost(struct kobject *kobj,
1159 				struct kobj_attribute *attr, char *buf)
1160 {
1161 	return sprintf(buf, "%u\n", hwp_boost);
1162 }
1163 
1164 static ssize_t store_hwp_dynamic_boost(struct kobject *a,
1165 				       struct kobj_attribute *b,
1166 				       const char *buf, size_t count)
1167 {
1168 	unsigned int input;
1169 	int ret;
1170 
1171 	ret = kstrtouint(buf, 10, &input);
1172 	if (ret)
1173 		return ret;
1174 
1175 	mutex_lock(&intel_pstate_driver_lock);
1176 	hwp_boost = !!input;
1177 	intel_pstate_update_policies();
1178 	mutex_unlock(&intel_pstate_driver_lock);
1179 
1180 	return count;
1181 }
1182 
1183 show_one(max_perf_pct, max_perf_pct);
1184 show_one(min_perf_pct, min_perf_pct);
1185 
1186 define_one_global_rw(status);
1187 define_one_global_rw(no_turbo);
1188 define_one_global_rw(max_perf_pct);
1189 define_one_global_rw(min_perf_pct);
1190 define_one_global_ro(turbo_pct);
1191 define_one_global_ro(num_pstates);
1192 define_one_global_rw(hwp_dynamic_boost);
1193 
1194 static struct attribute *intel_pstate_attributes[] = {
1195 	&status.attr,
1196 	&no_turbo.attr,
1197 	&turbo_pct.attr,
1198 	&num_pstates.attr,
1199 	NULL
1200 };
1201 
1202 static const struct attribute_group intel_pstate_attr_group = {
1203 	.attrs = intel_pstate_attributes,
1204 };
1205 
1206 static void __init intel_pstate_sysfs_expose_params(void)
1207 {
1208 	struct kobject *intel_pstate_kobject;
1209 	int rc;
1210 
1211 	intel_pstate_kobject = kobject_create_and_add("intel_pstate",
1212 						&cpu_subsys.dev_root->kobj);
1213 	if (WARN_ON(!intel_pstate_kobject))
1214 		return;
1215 
1216 	rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
1217 	if (WARN_ON(rc))
1218 		return;
1219 
1220 	/*
1221 	 * If per cpu limits are enforced there are no global limits, so
1222 	 * return without creating max/min_perf_pct attributes
1223 	 */
1224 	if (per_cpu_limits)
1225 		return;
1226 
1227 	rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
1228 	WARN_ON(rc);
1229 
1230 	rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
1231 	WARN_ON(rc);
1232 
1233 	if (hwp_active) {
1234 		rc = sysfs_create_file(intel_pstate_kobject,
1235 				       &hwp_dynamic_boost.attr);
1236 		WARN_ON(rc);
1237 	}
1238 }
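/*
 * The kobject created above is expected to appear as
 * /sys/devices/system/cpu/intel_pstate/, exposing status, no_turbo,
 * turbo_pct and num_pstates, plus max_perf_pct/min_perf_pct when global
 * limits are used and hwp_dynamic_boost when HWP is active.
 */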
1239 /************************** sysfs end ************************/
1240 
1241 static void intel_pstate_hwp_enable(struct cpudata *cpudata)
1242 {
	/* First disable HWP notification interrupts as we don't process them */
1244 	if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
1245 		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
1246 
1247 	wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
1248 	cpudata->epp_policy = 0;
1249 	if (cpudata->epp_default == -EINVAL)
1250 		cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
1251 }
1252 
1253 #define MSR_IA32_POWER_CTL_BIT_EE	19
1254 
1255 /* Disable energy efficiency optimization */
1256 static void intel_pstate_disable_ee(int cpu)
1257 {
1258 	u64 power_ctl;
1259 	int ret;
1260 
1261 	ret = rdmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, &power_ctl);
1262 	if (ret)
1263 		return;
1264 
1265 	if (!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE))) {
1266 		pr_info("Disabling energy efficiency optimization\n");
1267 		power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE);
1268 		wrmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, power_ctl);
1269 	}
1270 }
1271 
1272 static int atom_get_min_pstate(void)
1273 {
1274 	u64 value;
1275 
1276 	rdmsrl(MSR_ATOM_CORE_RATIOS, value);
1277 	return (value >> 8) & 0x7F;
1278 }
1279 
1280 static int atom_get_max_pstate(void)
1281 {
1282 	u64 value;
1283 
1284 	rdmsrl(MSR_ATOM_CORE_RATIOS, value);
1285 	return (value >> 16) & 0x7F;
1286 }
1287 
1288 static int atom_get_turbo_pstate(void)
1289 {
1290 	u64 value;
1291 
1292 	rdmsrl(MSR_ATOM_CORE_TURBO_RATIOS, value);
1293 	return value & 0x7F;
1294 }
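/*
 * atom_get_val() below interpolates the VID linearly between vid.min and
 * vid.max.  Worked example with hypothetical numbers: for vid.min == 20,
 * vid.max == 52, min_pstate == 6 and max_pstate == 22, vid.ratio is
 * (52 - 20) / (22 - 6) == 2 per P state, so P state 10 gets a VID of
 * 20 + 2 * (10 - 6) == 28 (ceiling_fp() rounds up any fractional result).
 */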
1295 
1296 static u64 atom_get_val(struct cpudata *cpudata, int pstate)
1297 {
1298 	u64 val;
1299 	int32_t vid_fp;
1300 	u32 vid;
1301 
1302 	val = (u64)pstate << 8;
1303 	if (global.no_turbo && !global.turbo_disabled)
1304 		val |= (u64)1 << 32;
1305 
1306 	vid_fp = cpudata->vid.min + mul_fp(
1307 		int_tofp(pstate - cpudata->pstate.min_pstate),
1308 		cpudata->vid.ratio);
1309 
1310 	vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
1311 	vid = ceiling_fp(vid_fp);
1312 
1313 	if (pstate > cpudata->pstate.max_pstate)
1314 		vid = cpudata->vid.turbo;
1315 
1316 	return val | vid;
1317 }
1318 
1319 static int silvermont_get_scaling(void)
1320 {
1321 	u64 value;
1322 	int i;
1323 	/* Defined in Table 35-6 from SDM (Sept 2015) */
1324 	static int silvermont_freq_table[] = {
1325 		83300, 100000, 133300, 116700, 80000};
1326 
1327 	rdmsrl(MSR_FSB_FREQ, value);
1328 	i = value & 0x7;
1329 	WARN_ON(i > 4);
1330 
1331 	return silvermont_freq_table[i];
1332 }
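/*
 * Example (illustrative): a bus frequency code of 1 selects 100000 kHz per
 * ratio unit, so a P state of 20 corresponds to 20 * 100000 kHz == 2 GHz.
 */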
1333 
1334 static int airmont_get_scaling(void)
1335 {
1336 	u64 value;
1337 	int i;
1338 	/* Defined in Table 35-10 from SDM (Sept 2015) */
1339 	static int airmont_freq_table[] = {
1340 		83300, 100000, 133300, 116700, 80000,
1341 		93300, 90000, 88900, 87500};
1342 
1343 	rdmsrl(MSR_FSB_FREQ, value);
1344 	i = value & 0xF;
1345 	WARN_ON(i > 8);
1346 
1347 	return airmont_freq_table[i];
1348 }
1349 
1350 static void atom_get_vid(struct cpudata *cpudata)
1351 {
1352 	u64 value;
1353 
1354 	rdmsrl(MSR_ATOM_CORE_VIDS, value);
1355 	cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
1356 	cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
1357 	cpudata->vid.ratio = div_fp(
1358 		cpudata->vid.max - cpudata->vid.min,
1359 		int_tofp(cpudata->pstate.max_pstate -
1360 			cpudata->pstate.min_pstate));
1361 
1362 	rdmsrl(MSR_ATOM_CORE_TURBO_VIDS, value);
1363 	cpudata->vid.turbo = value & 0x7f;
1364 }
1365 
1366 static int core_get_min_pstate(void)
1367 {
1368 	u64 value;
1369 
1370 	rdmsrl(MSR_PLATFORM_INFO, value);
1371 	return (value >> 40) & 0xFF;
1372 }
1373 
1374 static int core_get_max_pstate_physical(void)
1375 {
1376 	u64 value;
1377 
1378 	rdmsrl(MSR_PLATFORM_INFO, value);
1379 	return (value >> 8) & 0xFF;
1380 }
1381 
1382 static int core_get_tdp_ratio(u64 plat_info)
1383 {
1384 	/* Check how many TDP levels present */
1385 	if (plat_info & 0x600000000) {
1386 		u64 tdp_ctrl;
1387 		u64 tdp_ratio;
1388 		int tdp_msr;
1389 		int err;
1390 
1391 		/* Get the TDP level (0, 1, 2) to get ratios */
1392 		err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl);
1393 		if (err)
1394 			return err;
1395 
		/* TDP MSRs are contiguous starting at 0x648 */
1397 		tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03);
1398 		err = rdmsrl_safe(tdp_msr, &tdp_ratio);
1399 		if (err)
1400 			return err;
1401 
1402 		/* For level 1 and 2, bits[23:16] contain the ratio */
1403 		if (tdp_ctrl & 0x03)
1404 			tdp_ratio >>= 16;
1405 
1406 		tdp_ratio &= 0xff; /* ratios are only 8 bits long */
1407 		pr_debug("tdp_ratio %x\n", (int)tdp_ratio);
1408 
1409 		return (int)tdp_ratio;
1410 	}
1411 
1412 	return -ENXIO;
1413 }
1414 
1415 static int core_get_max_pstate(void)
1416 {
1417 	u64 tar;
1418 	u64 plat_info;
1419 	int max_pstate;
1420 	int tdp_ratio;
1421 	int err;
1422 
1423 	rdmsrl(MSR_PLATFORM_INFO, plat_info);
1424 	max_pstate = (plat_info >> 8) & 0xFF;
1425 
1426 	tdp_ratio = core_get_tdp_ratio(plat_info);
1427 	if (tdp_ratio <= 0)
1428 		return max_pstate;
1429 
1430 	if (hwp_active) {
1431 		/* Turbo activation ratio is not used on HWP platforms */
1432 		return tdp_ratio;
1433 	}
1434 
1435 	err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar);
1436 	if (!err) {
1437 		int tar_levels;
1438 
1439 		/* Do some sanity checking for safety */
1440 		tar_levels = tar & 0xff;
1441 		if (tdp_ratio - 1 == tar_levels) {
1442 			max_pstate = tar_levels;
1443 			pr_debug("max_pstate=TAC %x\n", max_pstate);
1444 		}
1445 	}
1446 
1447 	return max_pstate;
1448 }
1449 
1450 static int core_get_turbo_pstate(void)
1451 {
1452 	u64 value;
1453 	int nont, ret;
1454 
1455 	rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
1456 	nont = core_get_max_pstate();
1457 	ret = (value) & 255;
1458 	if (ret <= nont)
1459 		ret = nont;
1460 	return ret;
1461 }
1462 
1463 static inline int core_get_scaling(void)
1464 {
1465 	return 100000;
1466 }
1467 
1468 static u64 core_get_val(struct cpudata *cpudata, int pstate)
1469 {
1470 	u64 val;
1471 
1472 	val = (u64)pstate << 8;
1473 	if (global.no_turbo && !global.turbo_disabled)
1474 		val |= (u64)1 << 32;
1475 
1476 	return val;
1477 }
1478 
1479 static int knl_get_aperf_mperf_shift(void)
1480 {
1481 	return 10;
1482 }
1483 
1484 static int knl_get_turbo_pstate(void)
1485 {
1486 	u64 value;
1487 	int nont, ret;
1488 
1489 	rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
1490 	nont = core_get_max_pstate();
1491 	ret = (((value) >> 8) & 0xFF);
1492 	if (ret <= nont)
1493 		ret = nont;
1494 	return ret;
1495 }
1496 
1497 static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
1498 {
1499 	trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
1500 	cpu->pstate.current_pstate = pstate;
1501 	/*
1502 	 * Generally, there is no guarantee that this code will always run on
1503 	 * the CPU being updated, so force the register update to run on the
1504 	 * right CPU.
1505 	 */
1506 	wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
1507 		      pstate_funcs.get_val(cpu, pstate));
1508 }
1509 
1510 static void intel_pstate_set_min_pstate(struct cpudata *cpu)
1511 {
1512 	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
1513 }
1514 
1515 static void intel_pstate_max_within_limits(struct cpudata *cpu)
1516 {
1517 	int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);
1518 
1519 	update_turbo_state();
1520 	intel_pstate_set_pstate(cpu, pstate);
1521 }
1522 
1523 static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
1524 {
1525 	cpu->pstate.min_pstate = pstate_funcs.get_min();
1526 	cpu->pstate.max_pstate = pstate_funcs.get_max();
1527 	cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
1528 	cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
1529 	cpu->pstate.scaling = pstate_funcs.get_scaling();
1530 	cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
1531 
1532 	if (hwp_active && !hwp_mode_bdw) {
1533 		unsigned int phy_max, current_max;
1534 
1535 		intel_pstate_get_hwp_max(cpu->cpu, &phy_max, &current_max);
1536 		cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling;
1537 	} else {
1538 		cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
1539 	}
1540 
1541 	if (pstate_funcs.get_aperf_mperf_shift)
1542 		cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();
1543 
1544 	if (pstate_funcs.get_vid)
1545 		pstate_funcs.get_vid(cpu);
1546 
1547 	intel_pstate_set_min_pstate(cpu);
1548 }
1549 
/*
 * A long hold time will keep the high perf limits in place for a long time,
 * which negatively impacts perf/watt for some workloads, like specpower.
 * 3ms is based on experiments with some workloads.
 */
1556 static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
1557 
1558 static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu)
1559 {
1560 	u64 hwp_req = READ_ONCE(cpu->hwp_req_cached);
1561 	u32 max_limit = (hwp_req & 0xff00) >> 8;
1562 	u32 min_limit = (hwp_req & 0xff);
1563 	u32 boost_level1;
1564 
1565 	/*
1566 	 * Cases to consider (User changes via sysfs or boot time):
1567 	 * If, P0 (Turbo max) = P1 (Guaranteed max) = min:
1568 	 *	No boost, return.
1569 	 * If, P0 (Turbo max) > P1 (Guaranteed max) = min:
1570 	 *     Should result in one level boost only for P0.
1571 	 * If, P0 (Turbo max) = P1 (Guaranteed max) > min:
1572 	 *     Should result in two level boost:
1573 	 *         (min + p1)/2 and P1.
1574 	 * If, P0 (Turbo max) > P1 (Guaranteed max) > min:
1575 	 *     Should result in three level boost:
1576 	 *        (min + p1)/2, P1 and P0.
1577 	 */
1578 
1579 	/* If max and min are equal or already at max, nothing to boost */
1580 	if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit)
1581 		return;
1582 
1583 	if (!cpu->hwp_boost_min)
1584 		cpu->hwp_boost_min = min_limit;
1585 
	/* level at the halfway mark between min and guaranteed */
1587 	boost_level1 = (HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) + min_limit) >> 1;
1588 
1589 	if (cpu->hwp_boost_min < boost_level1)
1590 		cpu->hwp_boost_min = boost_level1;
1591 	else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(cpu->hwp_cap_cached))
1592 		cpu->hwp_boost_min = HWP_GUARANTEED_PERF(cpu->hwp_cap_cached);
1593 	else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) &&
1594 		 max_limit != HWP_GUARANTEED_PERF(cpu->hwp_cap_cached))
1595 		cpu->hwp_boost_min = max_limit;
1596 	else
1597 		return;
1598 
1599 	hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min;
1600 	wrmsrl(MSR_HWP_REQUEST, hwp_req);
1601 	cpu->last_update = cpu->sample.time;
1602 }
1603 
1604 static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu)
1605 {
1606 	if (cpu->hwp_boost_min) {
1607 		bool expired;
1608 
		/* Check if we have been idle for the hold time to boost down */
1610 		expired = time_after64(cpu->sample.time, cpu->last_update +
1611 				       hwp_boost_hold_time_ns);
1612 		if (expired) {
1613 			wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached);
1614 			cpu->hwp_boost_min = 0;
1615 		}
1616 	}
1617 	cpu->last_update = cpu->sample.time;
1618 }
1619 
1620 static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
1621 						      u64 time)
1622 {
1623 	cpu->sample.time = time;
1624 
1625 	if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
1626 		bool do_io = false;
1627 
1628 		cpu->sched_flags = 0;
		/*
		 * Set the iowait_boost flag and update the time. Since the
		 * IO WAIT flag is set all the time, we can't conclude that
		 * some IO-bound activity is scheduled on this CPU from just
		 * one occurrence. If we receive at least two in two
		 * consecutive ticks, then we treat it as a boost candidate.
		 */
1636 		if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
1637 			do_io = true;
1638 
1639 		cpu->last_io_update = time;
1640 
1641 		if (do_io)
1642 			intel_pstate_hwp_boost_up(cpu);
1643 
1644 	} else {
1645 		intel_pstate_hwp_boost_down(cpu);
1646 	}
1647 }
1648 
1649 static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
1650 						u64 time, unsigned int flags)
1651 {
1652 	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1653 
1654 	cpu->sched_flags |= flags;
1655 
1656 	if (smp_processor_id() == cpu->cpu)
1657 		intel_pstate_update_util_hwp_local(cpu, time);
1658 }
1659 
1660 static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
1661 {
1662 	struct sample *sample = &cpu->sample;
1663 
1664 	sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf);
1665 }
1666 
1667 static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
1668 {
1669 	u64 aperf, mperf;
1670 	unsigned long flags;
1671 	u64 tsc;
1672 
1673 	local_irq_save(flags);
1674 	rdmsrl(MSR_IA32_APERF, aperf);
1675 	rdmsrl(MSR_IA32_MPERF, mperf);
1676 	tsc = rdtsc();
1677 	if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
1678 		local_irq_restore(flags);
1679 		return false;
1680 	}
1681 	local_irq_restore(flags);
1682 
1683 	cpu->last_sample_time = cpu->sample.time;
1684 	cpu->sample.time = time;
1685 	cpu->sample.aperf = aperf;
1686 	cpu->sample.mperf = mperf;
1687 	cpu->sample.tsc =  tsc;
1688 	cpu->sample.aperf -= cpu->prev_aperf;
1689 	cpu->sample.mperf -= cpu->prev_mperf;
1690 	cpu->sample.tsc -= cpu->prev_tsc;
1691 
1692 	cpu->prev_aperf = aperf;
1693 	cpu->prev_mperf = mperf;
1694 	cpu->prev_tsc = tsc;
1695 	/*
1696 	 * First time this function is invoked in a given cycle, all of the
1697 	 * previous sample data fields are equal to zero or stale and they must
1698 	 * be populated with meaningful numbers for things to work, so assume
1699 	 * that sample.time will always be reset before setting the utilization
1700 	 * update hook and make the caller skip the sample then.
1701 	 */
1702 	if (cpu->last_sample_time) {
1703 		intel_pstate_calc_avg_perf(cpu);
1704 		return true;
1705 	}
1706 	return false;
1707 }
1708 
1709 static inline int32_t get_avg_frequency(struct cpudata *cpu)
1710 {
1711 	return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz);
1712 }
1713 
1714 static inline int32_t get_avg_pstate(struct cpudata *cpu)
1715 {
1716 	return mul_ext_fp(cpu->pstate.max_pstate_physical,
1717 			  cpu->sample.core_avg_perf);
1718 }
1719 
1720 static inline int32_t get_target_pstate(struct cpudata *cpu)
1721 {
1722 	struct sample *sample = &cpu->sample;
1723 	int32_t busy_frac;
1724 	int target, avg_pstate;
1725 
1726 	busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
1727 			   sample->tsc);
1728 
1729 	if (busy_frac < cpu->iowait_boost)
1730 		busy_frac = cpu->iowait_boost;
1731 
1732 	sample->busy_scaled = busy_frac * 100;
1733 
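	/*
	 * Illustrative arithmetic: the code below scales the highest available
	 * P state by busy_frac with 25% headroom, i.e. roughly
	 * target = 1.25 * max * busy_frac.  With hypothetical values
	 * max == 24 and busy_frac == 0.5, target == 15 (before clamping).
	 */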
1734 	target = global.no_turbo || global.turbo_disabled ?
1735 			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
1736 	target += target >> 2;
1737 	target = mul_fp(target, busy_frac);
1738 	if (target < cpu->pstate.min_pstate)
1739 		target = cpu->pstate.min_pstate;
1740 
1741 	/*
1742 	 * If the average P-state during the previous cycle was higher than the
1743 	 * current target, add 50% of the difference to the target to reduce
1744 	 * possible performance oscillations and offset possible performance
1745 	 * loss related to moving the workload from one CPU to another within
1746 	 * a package/module.
1747 	 */
1748 	avg_pstate = get_avg_pstate(cpu);
1749 	if (avg_pstate > target)
1750 		target += (avg_pstate - target) >> 1;
1751 
1752 	return target;
1753 }
1754 
1755 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
1756 {
1757 	int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio);
1758 	int max_pstate = max(min_pstate, cpu->max_perf_ratio);
1759 
1760 	return clamp_t(int, pstate, min_pstate, max_pstate);
1761 }
1762 
1763 static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
1764 {
1765 	if (pstate == cpu->pstate.current_pstate)
1766 		return;
1767 
1768 	cpu->pstate.current_pstate = pstate;
1769 	wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
1770 }
1771 
1772 static void intel_pstate_adjust_pstate(struct cpudata *cpu)
1773 {
1774 	int from = cpu->pstate.current_pstate;
1775 	struct sample *sample;
1776 	int target_pstate;
1777 
1778 	update_turbo_state();
1779 
1780 	target_pstate = get_target_pstate(cpu);
1781 	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
1782 	trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu);
1783 	intel_pstate_update_pstate(cpu, target_pstate);
1784 
1785 	sample = &cpu->sample;
1786 	trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf),
1787 		fp_toint(sample->busy_scaled),
1788 		from,
1789 		cpu->pstate.current_pstate,
1790 		sample->mperf,
1791 		sample->aperf,
1792 		sample->tsc,
1793 		get_avg_frequency(cpu),
1794 		fp_toint(cpu->iowait_boost * 100));
1795 }
1796 
1797 static void intel_pstate_update_util(struct update_util_data *data, u64 time,
1798 				     unsigned int flags)
1799 {
1800 	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1801 	u64 delta_ns;
1802 
1803 	/* Don't allow remote callbacks */
1804 	if (smp_processor_id() != cpu->cpu)
1805 		return;
1806 
1807 	delta_ns = time - cpu->last_update;
1808 	if (flags & SCHED_CPUFREQ_IOWAIT) {
1809 		/* Start over if the CPU may have been idle. */
1810 		if (delta_ns > TICK_NSEC) {
1811 			cpu->iowait_boost = ONE_EIGHTH_FP;
1812 		} else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
1813 			cpu->iowait_boost <<= 1;
1814 			if (cpu->iowait_boost > int_tofp(1))
1815 				cpu->iowait_boost = int_tofp(1);
1816 		} else {
1817 			cpu->iowait_boost = ONE_EIGHTH_FP;
1818 		}
1819 	} else if (cpu->iowait_boost) {
1820 		/* Clear iowait_boost if the CPU may have been idle. */
1821 		if (delta_ns > TICK_NSEC)
1822 			cpu->iowait_boost = 0;
1823 		else
1824 			cpu->iowait_boost >>= 1;
1825 	}
1826 	cpu->last_update = time;
1827 	delta_ns = time - cpu->sample.time;
1828 	if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
1829 		return;
1830 
1831 	if (intel_pstate_sample(cpu, time))
1832 		intel_pstate_adjust_pstate(cpu);
1833 }
1834 
1835 static struct pstate_funcs core_funcs = {
1836 	.get_max = core_get_max_pstate,
1837 	.get_max_physical = core_get_max_pstate_physical,
1838 	.get_min = core_get_min_pstate,
1839 	.get_turbo = core_get_turbo_pstate,
1840 	.get_scaling = core_get_scaling,
1841 	.get_val = core_get_val,
1842 };
1843 
1844 static const struct pstate_funcs silvermont_funcs = {
1845 	.get_max = atom_get_max_pstate,
1846 	.get_max_physical = atom_get_max_pstate,
1847 	.get_min = atom_get_min_pstate,
1848 	.get_turbo = atom_get_turbo_pstate,
1849 	.get_val = atom_get_val,
1850 	.get_scaling = silvermont_get_scaling,
1851 	.get_vid = atom_get_vid,
1852 };
1853 
1854 static const struct pstate_funcs airmont_funcs = {
1855 	.get_max = atom_get_max_pstate,
1856 	.get_max_physical = atom_get_max_pstate,
1857 	.get_min = atom_get_min_pstate,
1858 	.get_turbo = atom_get_turbo_pstate,
1859 	.get_val = atom_get_val,
1860 	.get_scaling = airmont_get_scaling,
1861 	.get_vid = atom_get_vid,
1862 };
1863 
1864 static const struct pstate_funcs knl_funcs = {
1865 	.get_max = core_get_max_pstate,
1866 	.get_max_physical = core_get_max_pstate_physical,
1867 	.get_min = core_get_min_pstate,
1868 	.get_turbo = knl_get_turbo_pstate,
1869 	.get_aperf_mperf_shift = knl_get_aperf_mperf_shift,
1870 	.get_scaling = core_get_scaling,
1871 	.get_val = core_get_val,
1872 };
1873 
1874 #define ICPU(model, policy) \
1875 	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
1876 			(unsigned long)&policy }
1877 
1878 static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
1879 	ICPU(INTEL_FAM6_SANDYBRIDGE, 		core_funcs),
1880 	ICPU(INTEL_FAM6_SANDYBRIDGE_X,		core_funcs),
1881 	ICPU(INTEL_FAM6_ATOM_SILVERMONT,	silvermont_funcs),
1882 	ICPU(INTEL_FAM6_IVYBRIDGE,		core_funcs),
1883 	ICPU(INTEL_FAM6_HASWELL_CORE,		core_funcs),
1884 	ICPU(INTEL_FAM6_BROADWELL_CORE,		core_funcs),
1885 	ICPU(INTEL_FAM6_IVYBRIDGE_X,		core_funcs),
1886 	ICPU(INTEL_FAM6_HASWELL_X,		core_funcs),
1887 	ICPU(INTEL_FAM6_HASWELL_ULT,		core_funcs),
1888 	ICPU(INTEL_FAM6_HASWELL_GT3E,		core_funcs),
1889 	ICPU(INTEL_FAM6_BROADWELL_GT3E,		core_funcs),
1890 	ICPU(INTEL_FAM6_ATOM_AIRMONT,		airmont_funcs),
1891 	ICPU(INTEL_FAM6_SKYLAKE_MOBILE,		core_funcs),
1892 	ICPU(INTEL_FAM6_BROADWELL_X,		core_funcs),
1893 	ICPU(INTEL_FAM6_SKYLAKE_DESKTOP,	core_funcs),
1894 	ICPU(INTEL_FAM6_BROADWELL_XEON_D,	core_funcs),
1895 	ICPU(INTEL_FAM6_XEON_PHI_KNL,		knl_funcs),
1896 	ICPU(INTEL_FAM6_XEON_PHI_KNM,		knl_funcs),
1897 	ICPU(INTEL_FAM6_ATOM_GOLDMONT,		core_funcs),
1898 	ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS,     core_funcs),
1899 	ICPU(INTEL_FAM6_SKYLAKE_X,		core_funcs),
1900 	{}
1901 };
1902 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
1903 
1904 static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
1905 	ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_funcs),
1906 	ICPU(INTEL_FAM6_BROADWELL_X, core_funcs),
1907 	ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs),
1908 	{}
1909 };
1910 
1911 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = {
1912 	ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, core_funcs),
1913 	{}
1914 };
1915 
1916 static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = {
1917 	ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs),
1918 	ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_funcs),
1919 	{}
1920 };
1921 
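/*
 * Allocate (on first use) and initialize the per-CPU data for @cpunum.  The
 * cpudata structure is kept around across CPU offline/online cycles, so the
 * EPP fields are only reset to "unknown" (-EINVAL) when it is first
 * allocated.  On HWP systems this also disables the energy-efficiency
 * optimization on the models in intel_pstate_cpu_ee_disable_ids and turns
 * on dynamic HWP boost on server-profile Skylake systems.
 */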
1922 static int intel_pstate_init_cpu(unsigned int cpunum)
1923 {
1924 	struct cpudata *cpu;
1925 
1926 	cpu = all_cpu_data[cpunum];
1927 
1928 	if (!cpu) {
1929 		cpu = kzalloc(sizeof(*cpu), GFP_KERNEL);
1930 		if (!cpu)
1931 			return -ENOMEM;
1932 
1933 		all_cpu_data[cpunum] = cpu;
1934 
1935 		cpu->epp_default = -EINVAL;
1936 		cpu->epp_powersave = -EINVAL;
1937 		cpu->epp_saved = -EINVAL;
1938 	}
1939 
1940 	cpu = all_cpu_data[cpunum];
1941 
1942 	cpu->cpu = cpunum;
1943 
1944 	if (hwp_active) {
1945 		const struct x86_cpu_id *id;
1946 
1947 		id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids);
1948 		if (id)
1949 			intel_pstate_disable_ee(cpunum);
1950 
1951 		intel_pstate_hwp_enable(cpu);
1952 
1953 		id = x86_match_cpu(intel_pstate_hwp_boost_ids);
1954 		if (id && intel_pstate_acpi_pm_profile_server())
1955 			hwp_boost = true;
1956 	}
1957 
1958 	intel_pstate_get_cpu_pstates(cpu);
1959 
1960 	pr_debug("controlling: cpu %d\n", cpunum);
1961 
1962 	return 0;
1963 }
1964 
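/*
 * Register the scheduler's utilization update callback for @cpu_num.  When
 * HWP is active the hardware picks P-states by itself, so the hook is only
 * installed in that case if dynamic HWP boost is enabled; otherwise the
 * regular sampling callback is used.
 */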
1965 static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
1966 {
1967 	struct cpudata *cpu = all_cpu_data[cpu_num];
1968 
1969 	if (hwp_active && !hwp_boost)
1970 		return;
1971 
1972 	if (cpu->update_util_set)
1973 		return;
1974 
1975 	/* Prevent intel_pstate_update_util() from using stale data. */
1976 	cpu->sample.time = 0;
1977 	cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
1978 				     (hwp_active ?
1979 				      intel_pstate_update_util_hwp :
1980 				      intel_pstate_update_util));
1981 	cpu->update_util_set = true;
1982 }
1983 
1984 static void intel_pstate_clear_update_util_hook(unsigned int cpu)
1985 {
1986 	struct cpudata *cpu_data = all_cpu_data[cpu];
1987 
1988 	if (!cpu_data->update_util_set)
1989 		return;
1990 
1991 	cpufreq_remove_update_util_hook(cpu);
1992 	cpu_data->update_util_set = false;
1993 	synchronize_rcu();
1994 }
1995 
1996 static int intel_pstate_get_max_freq(struct cpudata *cpu)
1997 {
1998 	return global.turbo_disabled || global.no_turbo ?
1999 			cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2000 }
2001 
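/*
 * Translate the policy's frequency limits (in kHz) into P-state ratio
 * limits and combine them with the global min/max percentage limits (unless
 * per-CPU limits are in use).  As an illustrative example, on a non-HWP CPU
 * with turbo enabled, a turbo ratio of 35 and a scaling of 100000 kHz per
 * ratio step, policy->max = 2800000 kHz maps to
 * max_policy_perf = 35 * 2800000 / 3500000 = 28.
 */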
2002 static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
2003 					    struct cpudata *cpu)
2004 {
2005 	int max_freq = intel_pstate_get_max_freq(cpu);
2006 	int32_t max_policy_perf, min_policy_perf;
2007 	int max_state, turbo_max;
2008 
2009 	/*
2010 	 * HWP needs some special consideration, because on BDX
2011 	 * HWP_REQUEST uses an abstract value to represent performance
2012 	 * rather than a pure ratio.
2013 	 */
2014 	if (hwp_active) {
2015 		intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state);
2016 	} else {
2017 		max_state = global.no_turbo || global.turbo_disabled ?
2018 			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
2019 		turbo_max = cpu->pstate.turbo_pstate;
2020 	}
2021 
2022 	max_policy_perf = max_state * policy->max / max_freq;
2023 	if (policy->max == policy->min) {
2024 		min_policy_perf = max_policy_perf;
2025 	} else {
2026 		min_policy_perf = max_state * policy->min / max_freq;
2027 		min_policy_perf = clamp_t(int32_t, min_policy_perf,
2028 					  0, max_policy_perf);
2029 	}
2030 
2031 	pr_debug("cpu:%d max_state %d min_policy_perf:%d max_policy_perf:%d\n",
2032 		 policy->cpu, max_state,
2033 		 min_policy_perf, max_policy_perf);
2034 
2035 	/* Normalize user input to [min_perf, max_perf] */
2036 	if (per_cpu_limits) {
2037 		cpu->min_perf_ratio = min_policy_perf;
2038 		cpu->max_perf_ratio = max_policy_perf;
2039 	} else {
2040 		int32_t global_min, global_max;
2041 
2042 		/* Global limits are in percent of the maximum turbo P-state. */
2043 		global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
2044 		global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
2045 		global_min = clamp_t(int32_t, global_min, 0, global_max);
2046 
2047 		pr_debug("cpu:%d global_min:%d global_max:%d\n", policy->cpu,
2048 			 global_min, global_max);
2049 
2050 		cpu->min_perf_ratio = max(min_policy_perf, global_min);
2051 		cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf);
2052 		cpu->max_perf_ratio = min(max_policy_perf, global_max);
2053 		cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio);
2054 
2055 		/* Make sure min_perf <= max_perf */
2056 		cpu->min_perf_ratio = min(cpu->min_perf_ratio,
2057 					  cpu->max_perf_ratio);
2058 
2059 	}
2060 	pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", policy->cpu,
2061 		 cpu->max_perf_ratio,
2062 		 cpu->min_perf_ratio);
2063 }
2064 
2065 static int intel_pstate_set_policy(struct cpufreq_policy *policy)
2066 {
2067 	struct cpudata *cpu;
2068 
2069 	if (!policy->cpuinfo.max_freq)
2070 		return -ENODEV;
2071 
2072 	pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
2073 		 policy->cpuinfo.max_freq, policy->max);
2074 
2075 	cpu = all_cpu_data[policy->cpu];
2076 	cpu->policy = policy->policy;
2077 
2078 	mutex_lock(&intel_pstate_limits_lock);
2079 
2080 	intel_pstate_update_perf_limits(policy, cpu);
2081 
2082 	if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
2083 		/*
2084 		 * NOHZ_FULL CPUs need this as the governor callback may not
2085 		 * be invoked on them.
2086 		 */
2087 		intel_pstate_clear_update_util_hook(policy->cpu);
2088 		intel_pstate_max_within_limits(cpu);
2089 	} else {
2090 		intel_pstate_set_update_util_hook(policy->cpu);
2091 	}
2092 
2093 	if (hwp_active) {
2094 		/*
2095 		 * If hwp_boost was enabled before and has been turned off
2096 		 * dynamically, the update util hook registered for it must
2097 		 * be cleared here.
2098 		 */
2099 		if (!hwp_boost)
2100 			intel_pstate_clear_update_util_hook(policy->cpu);
2101 		intel_pstate_hwp_set(policy->cpu);
2102 	}
2103 
2104 	mutex_unlock(&intel_pstate_limits_lock);
2105 
2106 	return 0;
2107 }
2108 
2109 static void intel_pstate_adjust_policy_max(struct cpufreq_policy *policy,
2110 					 struct cpudata *cpu)
2111 {
2112 	if (!hwp_active &&
2113 	    cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
2114 	    policy->max < policy->cpuinfo.max_freq &&
2115 	    policy->max > cpu->pstate.max_freq) {
2116 		pr_debug("policy->max > max non turbo frequency\n");
2117 		policy->max = policy->cpuinfo.max_freq;
2118 	}
2119 }
2120 
2121 static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
2122 {
2123 	struct cpudata *cpu = all_cpu_data[policy->cpu];
2124 
2125 	update_turbo_state();
2126 	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
2127 				     intel_pstate_get_max_freq(cpu));
2128 
2129 	if (policy->policy != CPUFREQ_POLICY_POWERSAVE &&
2130 	    policy->policy != CPUFREQ_POLICY_PERFORMANCE)
2131 		return -EINVAL;
2132 
2133 	intel_pstate_adjust_policy_max(policy, cpu);
2134 
2135 	return 0;
2136 }
2137 
2138 static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
2139 {
2140 	intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
2141 }
2142 
2143 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
2144 {
2145 	pr_debug("CPU %d exiting\n", policy->cpu);
2146 
2147 	intel_pstate_clear_update_util_hook(policy->cpu);
2148 	if (hwp_active) {
2149 		intel_pstate_hwp_save_state(policy);
2150 		intel_pstate_hwp_force_min_perf(policy->cpu);
2151 	} else {
2152 		intel_cpufreq_stop_cpu(policy);
2153 	}
2154 }
2155 
2156 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
2157 {
2158 	intel_pstate_exit_perf_limits(policy);
2159 
2160 	policy->fast_switch_possible = false;
2161 
2162 	return 0;
2163 }
2164 
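/*
 * Policy initialization shared by the active (intel_pstate) and passive
 * (intel_cpufreq) drivers: set up the per-CPU data, open up the perf ratio
 * limits to the widest range and derive cpuinfo.min/max_freq from the CPU's
 * P-state table, taking the current turbo state into account.
 */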
2165 static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
2166 {
2167 	struct cpudata *cpu;
2168 	int rc;
2169 
2170 	rc = intel_pstate_init_cpu(policy->cpu);
2171 	if (rc)
2172 		return rc;
2173 
2174 	cpu = all_cpu_data[policy->cpu];
2175 
2176 	cpu->max_perf_ratio = 0xFF;
2177 	cpu->min_perf_ratio = 0;
2178 
2179 	policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
2180 	policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
2181 
2182 	/* cpuinfo and default policy values */
2183 	policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
2184 	update_turbo_state();
2185 	global.turbo_disabled_mf = global.turbo_disabled;
2186 	policy->cpuinfo.max_freq = global.turbo_disabled ?
2187 			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
2188 	policy->cpuinfo.max_freq *= cpu->pstate.scaling;
2189 
2190 	if (hwp_active) {
2191 		unsigned int max_freq;
2192 
2193 		max_freq = global.turbo_disabled ?
2194 			cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2195 		if (max_freq < policy->cpuinfo.max_freq)
2196 			policy->cpuinfo.max_freq = max_freq;
2197 	}
2198 
2199 	intel_pstate_init_acpi_perf_limits(policy);
2200 
2201 	policy->fast_switch_possible = true;
2202 
2203 	return 0;
2204 }
2205 
2206 static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
2207 {
2208 	int ret = __intel_pstate_cpu_init(policy);
2209 
2210 	if (ret)
2211 		return ret;
2212 
2213 	if (IS_ENABLED(CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE))
2214 		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
2215 	else
2216 		policy->policy = CPUFREQ_POLICY_POWERSAVE;
2217 
2218 	return 0;
2219 }
2220 
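/*
 * Active-mode driver: it provides ->setpolicy rather than ->target, so the
 * cpufreq core leaves P-state selection to this driver (or to the HWP
 * hardware) and only the "performance" and "powersave" policies are
 * accepted (see intel_pstate_verify_policy()).
 */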
2221 static struct cpufreq_driver intel_pstate = {
2222 	.flags		= CPUFREQ_CONST_LOOPS,
2223 	.verify		= intel_pstate_verify_policy,
2224 	.setpolicy	= intel_pstate_set_policy,
2225 	.suspend	= intel_pstate_hwp_save_state,
2226 	.resume		= intel_pstate_resume,
2227 	.init		= intel_pstate_cpu_init,
2228 	.exit		= intel_pstate_cpu_exit,
2229 	.stop_cpu	= intel_pstate_stop_cpu,
2230 	.update_limits	= intel_pstate_update_limits,
2231 	.name		= "intel_pstate",
2232 };
2233 
2234 static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy)
2235 {
2236 	struct cpudata *cpu = all_cpu_data[policy->cpu];
2237 
2238 	update_turbo_state();
2239 	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
2240 				     intel_pstate_get_max_freq(cpu));
2241 
2242 	intel_pstate_adjust_policy_max(policy, cpu);
2243 
2244 	intel_pstate_update_perf_limits(policy, cpu);
2245 
2246 	return 0;
2247 }
2248 
2249 /* Use of trace in passive mode:
2250  *
2251  * In passive mode the trace core_busy field (also known as the
2252  * performance field, and labelled as such on the graphs; also known as
2253  * core_avg_perf) is not needed and so is re-assigned to indicate if the
2254  * driver call was via the normal or fast switch path. Various graphs
2255  * output from the intel_pstate_tracer.py utility that include core_busy
2256  * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%,
2257  * so we use 10 to indicate the normal path through the driver, and
2258  * 90 to indicate the fast switch path through the driver.
2259  * The scaled_busy field is not used, and is set to 0.
2260  */
2261 
2262 #define	INTEL_PSTATE_TRACE_TARGET 10
2263 #define	INTEL_PSTATE_TRACE_FAST_SWITCH 90
2264 
2265 static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
2266 {
2267 	struct sample *sample;
2268 
2269 	if (!trace_pstate_sample_enabled())
2270 		return;
2271 
2272 	if (!intel_pstate_sample(cpu, ktime_get()))
2273 		return;
2274 
2275 	sample = &cpu->sample;
2276 	trace_pstate_sample(trace_type,
2277 		0,
2278 		old_pstate,
2279 		cpu->pstate.current_pstate,
2280 		sample->mperf,
2281 		sample->aperf,
2282 		sample->tsc,
2283 		get_avg_frequency(cpu),
2284 		fp_toint(cpu->iowait_boost * 100));
2285 }
2286 
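/*
 * Passive-mode ->target callback.  The frequency requested by the governor
 * is converted to a P-state according to @relation: CPUFREQ_RELATION_L
 * rounds up, CPUFREQ_RELATION_H rounds down and anything else rounds to the
 * closest P-state.  For example, with a scaling of 100000 kHz per ratio
 * step, a 2650000 kHz request becomes P-state 27 for RELATION_L, 26 for
 * RELATION_H and 27 in the default case.
 */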
2287 static int intel_cpufreq_target(struct cpufreq_policy *policy,
2288 				unsigned int target_freq,
2289 				unsigned int relation)
2290 {
2291 	struct cpudata *cpu = all_cpu_data[policy->cpu];
2292 	struct cpufreq_freqs freqs;
2293 	int target_pstate, old_pstate;
2294 
2295 	update_turbo_state();
2296 
2297 	freqs.old = policy->cur;
2298 	freqs.new = target_freq;
2299 
2300 	cpufreq_freq_transition_begin(policy, &freqs);
2301 	switch (relation) {
2302 	case CPUFREQ_RELATION_L:
2303 		target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
2304 		break;
2305 	case CPUFREQ_RELATION_H:
2306 		target_pstate = freqs.new / cpu->pstate.scaling;
2307 		break;
2308 	default:
2309 		target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
2310 		break;
2311 	}
2312 	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
2313 	old_pstate = cpu->pstate.current_pstate;
2314 	if (target_pstate != cpu->pstate.current_pstate) {
2315 		cpu->pstate.current_pstate = target_pstate;
2316 		wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
2317 			      pstate_funcs.get_val(cpu, target_pstate));
2318 	}
2319 	freqs.new = target_pstate * cpu->pstate.scaling;
2320 	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate);
2321 	cpufreq_freq_transition_end(policy, &freqs, false);
2322 
2323 	return 0;
2324 }
2325 
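/*
 * Fast switch path used by governors such as schedutil: no transition
 * notifications are issued, the clamped P-state is written directly and the
 * resulting frequency is returned to the caller.
 */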
2326 static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
2327 					      unsigned int target_freq)
2328 {
2329 	struct cpudata *cpu = all_cpu_data[policy->cpu];
2330 	int target_pstate, old_pstate;
2331 
2332 	update_turbo_state();
2333 
2334 	target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
2335 	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
2336 	old_pstate = cpu->pstate.current_pstate;
2337 	intel_pstate_update_pstate(cpu, target_pstate);
2338 	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
2339 	return target_pstate * cpu->pstate.scaling;
2340 }
2341 
2342 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
2343 {
2344 	int ret = __intel_pstate_cpu_init(policy);
2345 
2346 	if (ret)
2347 		return ret;
2348 
2349 	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
2350 	policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
2351 	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
2352 	policy->cur = policy->cpuinfo.min_freq;
2353 
2354 	return 0;
2355 }
2356 
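/*
 * Passive-mode driver: ->target and ->fast_switch are provided, so P-state
 * selection is done by a generic cpufreq governor and this driver only
 * carries out the requested frequency changes.
 */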
2357 static struct cpufreq_driver intel_cpufreq = {
2358 	.flags		= CPUFREQ_CONST_LOOPS,
2359 	.verify		= intel_cpufreq_verify_policy,
2360 	.target		= intel_cpufreq_target,
2361 	.fast_switch	= intel_cpufreq_fast_switch,
2362 	.init		= intel_cpufreq_cpu_init,
2363 	.exit		= intel_pstate_cpu_exit,
2364 	.stop_cpu	= intel_cpufreq_stop_cpu,
2365 	.update_limits	= intel_pstate_update_limits,
2366 	.name		= "intel_cpufreq",
2367 };
2368 
2369 static struct cpufreq_driver *default_driver = &intel_pstate;
2370 
2371 static void intel_pstate_driver_cleanup(void)
2372 {
2373 	unsigned int cpu;
2374 
2375 	get_online_cpus();
2376 	for_each_online_cpu(cpu) {
2377 		if (all_cpu_data[cpu]) {
2378 			if (intel_pstate_driver == &intel_pstate)
2379 				intel_pstate_clear_update_util_hook(cpu);
2380 
2381 			kfree(all_cpu_data[cpu]);
2382 			all_cpu_data[cpu] = NULL;
2383 		}
2384 	}
2385 	put_online_cpus();
2386 	intel_pstate_driver = NULL;
2387 }
2388 
2389 static int intel_pstate_register_driver(struct cpufreq_driver *driver)
2390 {
2391 	int ret;
2392 
2393 	memset(&global, 0, sizeof(global));
2394 	global.max_perf_pct = 100;
2395 
2396 	intel_pstate_driver = driver;
2397 	ret = cpufreq_register_driver(intel_pstate_driver);
2398 	if (ret) {
2399 		intel_pstate_driver_cleanup();
2400 		return ret;
2401 	}
2402 
2403 	global.min_perf_pct = min_perf_pct_min();
2404 
2405 	return 0;
2406 }
2407 
2408 static int intel_pstate_unregister_driver(void)
2409 {
2410 	if (hwp_active)
2411 		return -EBUSY;
2412 
2413 	cpufreq_unregister_driver(intel_pstate_driver);
2414 	intel_pstate_driver_cleanup();
2415 
2416 	return 0;
2417 }
2418 
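/*
 * Handlers backing the driver's "status" attribute, normally exposed under
 * /sys/devices/system/cpu/intel_pstate/.  The operation mode can be
 * inspected and changed at run time, for example:
 *
 *   # cat /sys/devices/system/cpu/intel_pstate/status
 *   active
 *   # echo passive > /sys/devices/system/cpu/intel_pstate/status
 *
 * Switching away from the active driver fails with -EBUSY while HWP is in
 * use, because the driver cannot be unregistered in that case.
 */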
2419 static ssize_t intel_pstate_show_status(char *buf)
2420 {
2421 	if (!intel_pstate_driver)
2422 		return sprintf(buf, "off\n");
2423 
2424 	return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ?
2425 					"active" : "passive");
2426 }
2427 
2428 static int intel_pstate_update_status(const char *buf, size_t size)
2429 {
2430 	int ret;
2431 
2432 	if (size == 3 && !strncmp(buf, "off", size))
2433 		return intel_pstate_driver ?
2434 			intel_pstate_unregister_driver() : -EINVAL;
2435 
2436 	if (size == 6 && !strncmp(buf, "active", size)) {
2437 		if (intel_pstate_driver) {
2438 			if (intel_pstate_driver == &intel_pstate)
2439 				return 0;
2440 
2441 			ret = intel_pstate_unregister_driver();
2442 			if (ret)
2443 				return ret;
2444 		}
2445 
2446 		return intel_pstate_register_driver(&intel_pstate);
2447 	}
2448 
2449 	if (size == 7 && !strncmp(buf, "passive", size)) {
2450 		if (intel_pstate_driver) {
2451 			if (intel_pstate_driver == &intel_cpufreq)
2452 				return 0;
2453 
2454 			ret = intel_pstate_unregister_driver();
2455 			if (ret)
2456 				return ret;
2457 		}
2458 
2459 		return intel_pstate_register_driver(&intel_cpufreq);
2460 	}
2461 
2462 	return -EINVAL;
2463 }
2464 
2465 static int no_load __initdata;
2466 static int no_hwp __initdata;
2467 static int hwp_only __initdata;
2468 static unsigned int force_load __initdata;
2469 
2470 static int __init intel_pstate_msrs_not_valid(void)
2471 {
2472 	if (!pstate_funcs.get_max() ||
2473 	    !pstate_funcs.get_min() ||
2474 	    !pstate_funcs.get_turbo())
2475 		return -ENODEV;
2476 
2477 	return 0;
2478 }
2479 
2480 static void __init copy_cpu_funcs(const struct pstate_funcs *funcs)
2481 {
2482 	pstate_funcs.get_max   = funcs->get_max;
2483 	pstate_funcs.get_max_physical = funcs->get_max_physical;
2484 	pstate_funcs.get_min   = funcs->get_min;
2485 	pstate_funcs.get_turbo = funcs->get_turbo;
2486 	pstate_funcs.get_scaling = funcs->get_scaling;
2487 	pstate_funcs.get_val   = funcs->get_val;
2488 	pstate_funcs.get_vid   = funcs->get_vid;
2489 	pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift;
2490 }
2491 
2492 #ifdef CONFIG_ACPI
2493 
2494 static bool __init intel_pstate_no_acpi_pss(void)
2495 {
2496 	int i;
2497 
2498 	for_each_possible_cpu(i) {
2499 		acpi_status status;
2500 		union acpi_object *pss;
2501 		struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
2502 		struct acpi_processor *pr = per_cpu(processors, i);
2503 
2504 		if (!pr)
2505 			continue;
2506 
2507 		status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
2508 		if (ACPI_FAILURE(status))
2509 			continue;
2510 
2511 		pss = buffer.pointer;
2512 		if (pss && pss->type == ACPI_TYPE_PACKAGE) {
2513 			kfree(pss);
2514 			return false;
2515 		}
2516 
2517 		kfree(pss);
2518 	}
2519 
2520 	pr_debug("ACPI _PSS not found\n");
2521 	return true;
2522 }
2523 
2524 static bool __init intel_pstate_no_acpi_pcch(void)
2525 {
2526 	acpi_status status;
2527 	acpi_handle handle;
2528 
2529 	status = acpi_get_handle(NULL, "\\_SB", &handle);
2530 	if (ACPI_FAILURE(status))
2531 		goto not_found;
2532 
2533 	if (acpi_has_method(handle, "PCCH"))
2534 		return false;
2535 
2536 not_found:
2537 	pr_debug("ACPI PCCH not found\n");
2538 	return true;
2539 }
2540 
2541 static bool __init intel_pstate_has_acpi_ppc(void)
2542 {
2543 	int i;
2544 
2545 	for_each_possible_cpu(i) {
2546 		struct acpi_processor *pr = per_cpu(processors, i);
2547 
2548 		if (!pr)
2549 			continue;
2550 		if (acpi_has_method(pr->handle, "_PPC"))
2551 			return true;
2552 	}
2553 	pr_debug("ACPI _PPC not found\n");
2554 	return false;
2555 }
2556 
2557 enum {
2558 	PSS,
2559 	PPC,
2560 };
2561 
2562 /* Vendor-specific platforms known to provide their own power management modes */
2563 static struct acpi_platform_list plat_info[] __initdata = {
2564 	{"HP    ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, 0, PSS},
2565 	{"ORACLE", "X4-2    ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2566 	{"ORACLE", "X4-2L   ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2567 	{"ORACLE", "X4-2B   ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2568 	{"ORACLE", "X3-2    ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2569 	{"ORACLE", "X3-2L   ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2570 	{"ORACLE", "X3-2B   ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2571 	{"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2572 	{"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2573 	{"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2574 	{"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2575 	{"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2576 	{"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2577 	{"ORACLE", "X6-2    ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2578 	{"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, 0, PPC},
2579 	{ } /* End */
2580 };
2581 
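/*
 * Check whether the platform firmware is expected to manage P-states
 * itself: either the out-of-band (OOB) P-state control bit (bit 8) is set
 * in MSR_MISC_PWR_MGMT on one of the server parts listed in
 * intel_pstate_cpu_oob_ids, or the platform matches plat_info and exposes
 * the corresponding ACPI interface (_PSS/PCCH, or _PPC unless "force" was
 * passed on the command line).
 */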
2582 static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
2583 {
2584 	const struct x86_cpu_id *id;
2585 	u64 misc_pwr;
2586 	int idx;
2587 
2588 	id = x86_match_cpu(intel_pstate_cpu_oob_ids);
2589 	if (id) {
2590 		rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
2591 		if (misc_pwr & (1 << 8)) {
2592 			pr_debug("Bit 8 in the MISC_PWR_MGMT MSR set\n");
2593 			return true;
2594 		}
2595 	}
2596 
2597 	idx = acpi_match_platform_list(plat_info);
2598 	if (idx < 0)
2599 		return false;
2600 
2601 	switch (plat_info[idx].data) {
2602 	case PSS:
2603 		if (!intel_pstate_no_acpi_pss())
2604 			return false;
2605 
2606 		return intel_pstate_no_acpi_pcch();
2607 	case PPC:
2608 		return intel_pstate_has_acpi_ppc() && !force_load;
2609 	}
2610 
2611 	return false;
2612 }
2613 
2614 static void intel_pstate_request_control_from_smm(void)
2615 {
2616 	/*
2617 	 * It may be unsafe to request P-states control from SMM if _PPC support
2618 	 * has not been enabled.
2619 	 */
2620 	if (acpi_ppc)
2621 		acpi_processor_pstate_control();
2622 }
2623 #else /* CONFIG_ACPI not enabled */
2624 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
2625 static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
2626 static inline void intel_pstate_request_control_from_smm(void) {}
2627 #endif /* CONFIG_ACPI */
2628 
2629 #define INTEL_PSTATE_HWP_BROADWELL	0x01
2630 
2631 #define ICPU_HWP(model, hwp_mode) \
2632 	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_HWP, hwp_mode }
2633 
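/*
 * CPUs supporting HWP.  The X86_MODEL_ANY entry matches any Intel CPU that
 * advertises the HWP feature; the explicit Broadwell Xeon entries carry
 * INTEL_PSTATE_HWP_BROADWELL in driver_data, which intel_pstate_init()
 * copies into hwp_mode_bdw so that Broadwell-specific handling can be
 * applied elsewhere in the driver.
 */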
2634 static const struct x86_cpu_id hwp_support_ids[] __initconst = {
2635 	ICPU_HWP(INTEL_FAM6_BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL),
2636 	ICPU_HWP(INTEL_FAM6_BROADWELL_XEON_D, INTEL_PSTATE_HWP_BROADWELL),
2637 	ICPU_HWP(X86_MODEL_ANY, 0),
2638 	{}
2639 };
2640 
2641 static int __init intel_pstate_init(void)
2642 {
2643 	const struct x86_cpu_id *id;
2644 	int rc;
2645 
2646 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
2647 		return -ENODEV;
2648 
2649 	if (no_load)
2650 		return -ENODEV;
2651 
2652 	id = x86_match_cpu(hwp_support_ids);
2653 	if (id) {
2654 		copy_cpu_funcs(&core_funcs);
2655 		if (!no_hwp) {
2656 			hwp_active++;
2657 			hwp_mode_bdw = id->driver_data;
2658 			intel_pstate.attr = hwp_cpufreq_attrs;
2659 			goto hwp_cpu_matched;
2660 		}
2661 	} else {
2662 		id = x86_match_cpu(intel_pstate_cpu_ids);
2663 		if (!id) {
2664 			pr_info("CPU model not supported\n");
2665 			return -ENODEV;
2666 		}
2667 
2668 		copy_cpu_funcs((struct pstate_funcs *)id->driver_data);
2669 	}
2670 
2671 	if (intel_pstate_msrs_not_valid()) {
2672 		pr_info("Invalid MSRs\n");
2673 		return -ENODEV;
2674 	}
2675 
2676 hwp_cpu_matched:
2677 	/*
2678 	 * The Intel pstate driver will be ignored if the platform
2679 	 * firmware has its own power management modes.
2680 	 */
2681 	if (intel_pstate_platform_pwr_mgmt_exists()) {
2682 		pr_info("P-states controlled by the platform\n");
2683 		return -ENODEV;
2684 	}
2685 
2686 	if (!hwp_active && hwp_only)
2687 		return -ENOTSUPP;
2688 
2689 	pr_info("Intel P-state driver initializing\n");
2690 
2691 	all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
2692 	if (!all_cpu_data)
2693 		return -ENOMEM;
2694 
2695 	intel_pstate_request_control_from_smm();
2696 
2697 	intel_pstate_sysfs_expose_params();
2698 
2699 	mutex_lock(&intel_pstate_driver_lock);
2700 	rc = intel_pstate_register_driver(default_driver);
2701 	mutex_unlock(&intel_pstate_driver_lock);
2702 	if (rc)
2703 		return rc;
2704 
2705 	if (hwp_active)
2706 		pr_info("HWP enabled\n");
2707 
2708 	return 0;
2709 }
2710 device_initcall(intel_pstate_init);
2711 
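/*
 * Early parsing of the "intel_pstate=" kernel command line parameter, e.g.
 * "intel_pstate=passive".  Recognized values:
 *
 *   disable              - do not register the driver at all
 *   passive              - register the intel_cpufreq driver (implies no_hwp)
 *   no_hwp               - do not enable hardware-managed P-states (HWP)
 *   force                - register even if ACPI _PPC suggests platform control
 *   hwp_only             - load only on CPUs that support HWP
 *   per_cpu_perf_limits  - use per-CPU performance limits instead of global ones
 *   support_acpi_ppc     - take ACPI _PPC limits into account (CONFIG_ACPI only)
 */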
2712 static int __init intel_pstate_setup(char *str)
2713 {
2714 	if (!str)
2715 		return -EINVAL;
2716 
2717 	if (!strcmp(str, "disable")) {
2718 		no_load = 1;
2719 	} else if (!strcmp(str, "passive")) {
2720 		pr_info("Passive mode enabled\n");
2721 		default_driver = &intel_cpufreq;
2722 		no_hwp = 1;
2723 	}
2724 	if (!strcmp(str, "no_hwp")) {
2725 		pr_info("HWP disabled\n");
2726 		no_hwp = 1;
2727 	}
2728 	if (!strcmp(str, "force"))
2729 		force_load = 1;
2730 	if (!strcmp(str, "hwp_only"))
2731 		hwp_only = 1;
2732 	if (!strcmp(str, "per_cpu_perf_limits"))
2733 		per_cpu_limits = true;
2734 
2735 #ifdef CONFIG_ACPI
2736 	if (!strcmp(str, "support_acpi_ppc"))
2737 		acpi_ppc = true;
2738 #endif
2739 
2740 	return 0;
2741 }
2742 early_param("intel_pstate", intel_pstate_setup);
2743 
2744 MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
2745 MODULE_DESCRIPTION("'intel_pstate' - P state driver for Intel Core processors");
2746 MODULE_LICENSE("GPL");
2747