1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved
4  */
5 
6 #include <linux/cpu.h>
7 #include <linux/cpufreq.h>
8 #include <linux/delay.h>
9 #include <linux/dma-mapping.h>
10 #include <linux/module.h>
11 #include <linux/of.h>
12 #include <linux/of_platform.h>
13 #include <linux/platform_device.h>
14 #include <linux/slab.h>
15 
16 #include <asm/smp_plat.h>
17 
18 #include <soc/tegra/bpmp.h>
19 #include <soc/tegra/bpmp-abi.h>
20 
/* Scale factor used for Hz <-> kHz and kHz <-> MHz conversions. */
#define KHZ                     1000
#define REF_CLK_MHZ             408 /* 408 MHz */
/* Length of the counter observation window, passed to udelay(). */
#define US_DELAY                500
/* Target spacing between frequency table entries: 50 MHz expressed in Hz. */
#define CPUFREQ_TBL_STEP_HZ     (50 * KHZ * KHZ)
/* All-ones unsigned value: the largest count a 32-bit counter can hold. */
#define MAX_CNT                 ~0U

/* cpufreq transition latency */
#define TEGRA_CPUFREQ_TRANSITION_LATENCY (300 * 1000) /* unit in nanoseconds */
29 
/*
 * Cluster indices. MAX_CLUSTERS is not a cluster of its own; probe uses it
 * as the number of per-cluster frequency table slots to allocate.
 */
enum cluster {
	CLUSTER0,
	CLUSTER1,
	CLUSTER2,
	CLUSTER3,
	MAX_CLUSTERS,
};
37 
/*
 * Driver-wide state. Allocated in probe and published to the cpufreq
 * callbacks through tegra194_cpufreq_driver.driver_data.
 */
struct tegra194_cpufreq_data {
	/* NOTE(review): not referenced by any code visible in this file */
	void __iomem *regs;
	/* number of slots in @tables */
	size_t num_clusters;
	/*
	 * One frequency table per cluster, indexed by cluster number. A slot
	 * is NULL when init_freq_table() reported the cluster as unavailable.
	 */
	struct cpufreq_frequency_table **tables;
};
43 
/*
 * Two consecutive samples of the frequency feedback register, split into
 * its two 32-bit halves. The "last_" members hold the first sample (start
 * of the observation window), the others hold the second sample.
 */
struct tegra_cpu_ctr {
	/* CPU the measurement was requested for */
	u32 cpu;
	/* upper 32 bits of the feedback register: core clock counter */
	u32 coreclk_cnt, last_coreclk_cnt;
	/* lower 32 bits of the feedback register: reference clock counter */
	u32 refclk_cnt, last_refclk_cnt;
};
49 
/*
 * Work item used to sample the counters on a specific CPU. The handler
 * recovers this structure from @work with container_of() and fills in @c.
 */
struct read_counters_work {
	struct work_struct work;
	struct tegra_cpu_ctr c;
};
54 
/* Workqueue the counter sampling work runs on; created in probe. */
static struct workqueue_struct *read_counters_wq;
56 
57 static void get_cpu_cluster(void *cluster)
58 {
59 	u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
60 
61 	*((uint32_t *)cluster) = MPIDR_AFFINITY_LEVEL(mpidr, 1);
62 }
63 
64 /*
65  * Read per-core Read-only system register NVFREQ_FEEDBACK_EL1.
66  * The register provides frequency feedback information to
67  * determine the average actual frequency a core has run at over
68  * a period of time.
69  *	[31:0] PLLP counter: Counts at fixed frequency (408 MHz)
70  *	[63:32] Core clock counter: counts on every core clock cycle
71  *			where the core is architecturally clocking
72  */
73 static u64 read_freq_feedback(void)
74 {
75 	u64 val = 0;
76 
77 	asm volatile("mrs %0, s3_0_c15_c0_5" : "=r" (val) : );
78 
79 	return val;
80 }
81 
82 static inline u32 map_ndiv_to_freq(struct mrq_cpu_ndiv_limits_response
83 				   *nltbl, u16 ndiv)
84 {
85 	return nltbl->ref_clk_hz / KHZ * ndiv / (nltbl->pdiv * nltbl->mdiv);
86 }
87 
88 static void tegra_read_counters(struct work_struct *work)
89 {
90 	struct read_counters_work *read_counters_work;
91 	struct tegra_cpu_ctr *c;
92 	u64 val;
93 
94 	/*
95 	 * ref_clk_counter(32 bit counter) runs on constant clk,
96 	 * pll_p(408MHz).
97 	 * It will take = 2 ^ 32 / 408 MHz to overflow ref clk counter
98 	 *              = 10526880 usec = 10.527 sec to overflow
99 	 *
100 	 * Like wise core_clk_counter(32 bit counter) runs on core clock.
101 	 * It's synchronized to crab_clk (cpu_crab_clk) which runs at
102 	 * freq of cluster. Assuming max cluster clock ~2000MHz,
103 	 * It will take = 2 ^ 32 / 2000 MHz to overflow core clk counter
104 	 *              = ~2.147 sec to overflow
105 	 */
106 	read_counters_work = container_of(work, struct read_counters_work,
107 					  work);
108 	c = &read_counters_work->c;
109 
110 	val = read_freq_feedback();
111 	c->last_refclk_cnt = lower_32_bits(val);
112 	c->last_coreclk_cnt = upper_32_bits(val);
113 	udelay(US_DELAY);
114 	val = read_freq_feedback();
115 	c->refclk_cnt = lower_32_bits(val);
116 	c->coreclk_cnt = upper_32_bits(val);
117 }
118 
119 /*
120  * Return instantaneous cpu speed
121  * Instantaneous freq is calculated as -
122  * -Takes sample on every query of getting the freq.
123  *	- Read core and ref clock counters;
124  *	- Delay for X us
125  *	- Read above cycle counters again
126  *	- Calculates freq by subtracting current and previous counters
127  *	  divided by the delay time or eqv. of ref_clk_counter in delta time
128  *	- Return Kcycles/second, freq in KHz
129  *
130  *	delta time period = x sec
131  *			  = delta ref_clk_counter / (408 * 10^6) sec
132  *	freq in Hz = cycles/sec
133  *		   = (delta cycles / x sec
134  *		   = (delta cycles * 408 * 10^6) / delta ref_clk_counter
135  *	in KHz	   = (delta cycles * 408 * 10^3) / delta ref_clk_counter
136  *
137  * @cpu - logical cpu whose freq to be updated
138  * Returns freq in KHz on success, 0 if cpu is offline
139  */
140 static unsigned int tegra194_calculate_speed(u32 cpu)
141 {
142 	struct read_counters_work read_counters_work;
143 	struct tegra_cpu_ctr c;
144 	u32 delta_refcnt;
145 	u32 delta_ccnt;
146 	u32 rate_mhz;
147 
148 	/*
149 	 * udelay() is required to reconstruct cpu frequency over an
150 	 * observation window. Using workqueue to call udelay() with
151 	 * interrupts enabled.
152 	 */
153 	read_counters_work.c.cpu = cpu;
154 	INIT_WORK_ONSTACK(&read_counters_work.work, tegra_read_counters);
155 	queue_work_on(cpu, read_counters_wq, &read_counters_work.work);
156 	flush_work(&read_counters_work.work);
157 	c = read_counters_work.c;
158 
159 	if (c.coreclk_cnt < c.last_coreclk_cnt)
160 		delta_ccnt = c.coreclk_cnt + (MAX_CNT - c.last_coreclk_cnt);
161 	else
162 		delta_ccnt = c.coreclk_cnt - c.last_coreclk_cnt;
163 	if (!delta_ccnt)
164 		return 0;
165 
166 	/* ref clock is 32 bits */
167 	if (c.refclk_cnt < c.last_refclk_cnt)
168 		delta_refcnt = c.refclk_cnt + (MAX_CNT - c.last_refclk_cnt);
169 	else
170 		delta_refcnt = c.refclk_cnt - c.last_refclk_cnt;
171 	if (!delta_refcnt) {
172 		pr_debug("cpufreq: %d is idle, delta_refcnt: 0\n", cpu);
173 		return 0;
174 	}
175 	rate_mhz = ((unsigned long)(delta_ccnt * REF_CLK_MHZ)) / delta_refcnt;
176 
177 	return (rate_mhz * KHZ); /* in KHz */
178 }
179 
180 static void get_cpu_ndiv(void *ndiv)
181 {
182 	u64 ndiv_val;
183 
184 	asm volatile("mrs %0, s3_0_c15_c0_4" : "=r" (ndiv_val) : );
185 
186 	*(u64 *)ndiv = ndiv_val;
187 }
188 
189 static void set_cpu_ndiv(void *data)
190 {
191 	struct cpufreq_frequency_table *tbl = data;
192 	u64 ndiv_val = (u64)tbl->driver_data;
193 
194 	asm volatile("msr s3_0_c15_c0_4, %0" : : "r" (ndiv_val));
195 }
196 
197 static unsigned int tegra194_get_speed(u32 cpu)
198 {
199 	struct tegra194_cpufreq_data *data = cpufreq_get_driver_data();
200 	struct cpufreq_frequency_table *pos;
201 	unsigned int rate;
202 	u64 ndiv;
203 	int ret;
204 	u32 cl;
205 
206 	smp_call_function_single(cpu, get_cpu_cluster, &cl, true);
207 
208 	/* reconstruct actual cpu freq using counters */
209 	rate = tegra194_calculate_speed(cpu);
210 
211 	/* get last written ndiv value */
212 	ret = smp_call_function_single(cpu, get_cpu_ndiv, &ndiv, true);
213 	if (WARN_ON_ONCE(ret))
214 		return rate;
215 
216 	/*
217 	 * If the reconstructed frequency has acceptable delta from
218 	 * the last written value, then return freq corresponding
219 	 * to the last written ndiv value from freq_table. This is
220 	 * done to return consistent value.
221 	 */
222 	cpufreq_for_each_valid_entry(pos, data->tables[cl]) {
223 		if (pos->driver_data != ndiv)
224 			continue;
225 
226 		if (abs(pos->frequency - rate) > 115200) {
227 			pr_warn("cpufreq: cpu%d,cur:%u,set:%u,set ndiv:%llu\n",
228 				cpu, rate, pos->frequency, ndiv);
229 		} else {
230 			rate = pos->frequency;
231 		}
232 		break;
233 	}
234 	return rate;
235 }
236 
237 static int tegra194_cpufreq_init(struct cpufreq_policy *policy)
238 {
239 	struct tegra194_cpufreq_data *data = cpufreq_get_driver_data();
240 	u32 cpu;
241 	u32 cl;
242 
243 	smp_call_function_single(policy->cpu, get_cpu_cluster, &cl, true);
244 
245 	if (cl >= data->num_clusters || !data->tables[cl])
246 		return -EINVAL;
247 
248 	/* set same policy for all cpus in a cluster */
249 	for (cpu = (cl * 2); cpu < ((cl + 1) * 2); cpu++)
250 		cpumask_set_cpu(cpu, policy->cpus);
251 
252 	policy->freq_table = data->tables[cl];
253 	policy->cpuinfo.transition_latency = TEGRA_CPUFREQ_TRANSITION_LATENCY;
254 
255 	return 0;
256 }
257 
258 static int tegra194_cpufreq_set_target(struct cpufreq_policy *policy,
259 				       unsigned int index)
260 {
261 	struct cpufreq_frequency_table *tbl = policy->freq_table + index;
262 
263 	/*
264 	 * Each core writes frequency in per core register. Then both cores
265 	 * in a cluster run at same frequency which is the maximum frequency
266 	 * request out of the values requested by both cores in that cluster.
267 	 */
268 	on_each_cpu_mask(policy->cpus, set_cpu_ndiv, tbl, true);
269 
270 	return 0;
271 }
272 
/*
 * cpufreq driver description. driver_data is not set here; probe fills it
 * in with the struct tegra194_cpufreq_data before registering the driver.
 */
static struct cpufreq_driver tegra194_cpufreq_driver = {
	.name = "tegra194",
	.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
	.verify = cpufreq_generic_frequency_table_verify,
	.target_index = tegra194_cpufreq_set_target,
	.get = tegra194_get_speed,
	.init = tegra194_cpufreq_init,
	.attr = cpufreq_generic_attr,
};
282 
283 static void tegra194_cpufreq_free_resources(void)
284 {
285 	destroy_workqueue(read_counters_wq);
286 }
287 
288 static struct cpufreq_frequency_table *
289 init_freq_table(struct platform_device *pdev, struct tegra_bpmp *bpmp,
290 		unsigned int cluster_id)
291 {
292 	struct cpufreq_frequency_table *freq_table;
293 	struct mrq_cpu_ndiv_limits_response resp;
294 	unsigned int num_freqs, ndiv, delta_ndiv;
295 	struct mrq_cpu_ndiv_limits_request req;
296 	struct tegra_bpmp_message msg;
297 	u16 freq_table_step_size;
298 	int err, index;
299 
300 	memset(&req, 0, sizeof(req));
301 	req.cluster_id = cluster_id;
302 
303 	memset(&msg, 0, sizeof(msg));
304 	msg.mrq = MRQ_CPU_NDIV_LIMITS;
305 	msg.tx.data = &req;
306 	msg.tx.size = sizeof(req);
307 	msg.rx.data = &resp;
308 	msg.rx.size = sizeof(resp);
309 
310 	err = tegra_bpmp_transfer(bpmp, &msg);
311 	if (err)
312 		return ERR_PTR(err);
313 	if (msg.rx.ret == -BPMP_EINVAL) {
314 		/* Cluster not available */
315 		return NULL;
316 	}
317 	if (msg.rx.ret)
318 		return ERR_PTR(-EINVAL);
319 
320 	/*
321 	 * Make sure frequency table step is a multiple of mdiv to match
322 	 * vhint table granularity.
323 	 */
324 	freq_table_step_size = resp.mdiv *
325 			DIV_ROUND_UP(CPUFREQ_TBL_STEP_HZ, resp.ref_clk_hz);
326 
327 	dev_dbg(&pdev->dev, "cluster %d: frequency table step size: %d\n",
328 		cluster_id, freq_table_step_size);
329 
330 	delta_ndiv = resp.ndiv_max - resp.ndiv_min;
331 
332 	if (unlikely(delta_ndiv == 0)) {
333 		num_freqs = 1;
334 	} else {
335 		/* We store both ndiv_min and ndiv_max hence the +1 */
336 		num_freqs = delta_ndiv / freq_table_step_size + 1;
337 	}
338 
339 	num_freqs += (delta_ndiv % freq_table_step_size) ? 1 : 0;
340 
341 	freq_table = devm_kcalloc(&pdev->dev, num_freqs + 1,
342 				  sizeof(*freq_table), GFP_KERNEL);
343 	if (!freq_table)
344 		return ERR_PTR(-ENOMEM);
345 
346 	for (index = 0, ndiv = resp.ndiv_min;
347 			ndiv < resp.ndiv_max;
348 			index++, ndiv += freq_table_step_size) {
349 		freq_table[index].driver_data = ndiv;
350 		freq_table[index].frequency = map_ndiv_to_freq(&resp, ndiv);
351 	}
352 
353 	freq_table[index].driver_data = resp.ndiv_max;
354 	freq_table[index++].frequency = map_ndiv_to_freq(&resp, resp.ndiv_max);
355 	freq_table[index].frequency = CPUFREQ_TABLE_END;
356 
357 	return freq_table;
358 }
359 
360 static int tegra194_cpufreq_probe(struct platform_device *pdev)
361 {
362 	struct tegra194_cpufreq_data *data;
363 	struct tegra_bpmp *bpmp;
364 	int err, i;
365 
366 	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
367 	if (!data)
368 		return -ENOMEM;
369 
370 	data->num_clusters = MAX_CLUSTERS;
371 	data->tables = devm_kcalloc(&pdev->dev, data->num_clusters,
372 				    sizeof(*data->tables), GFP_KERNEL);
373 	if (!data->tables)
374 		return -ENOMEM;
375 
376 	platform_set_drvdata(pdev, data);
377 
378 	bpmp = tegra_bpmp_get(&pdev->dev);
379 	if (IS_ERR(bpmp))
380 		return PTR_ERR(bpmp);
381 
382 	read_counters_wq = alloc_workqueue("read_counters_wq", __WQ_LEGACY, 1);
383 	if (!read_counters_wq) {
384 		dev_err(&pdev->dev, "fail to create_workqueue\n");
385 		err = -EINVAL;
386 		goto put_bpmp;
387 	}
388 
389 	for (i = 0; i < data->num_clusters; i++) {
390 		data->tables[i] = init_freq_table(pdev, bpmp, i);
391 		if (IS_ERR(data->tables[i])) {
392 			err = PTR_ERR(data->tables[i]);
393 			goto err_free_res;
394 		}
395 	}
396 
397 	tegra194_cpufreq_driver.driver_data = data;
398 
399 	err = cpufreq_register_driver(&tegra194_cpufreq_driver);
400 	if (!err)
401 		goto put_bpmp;
402 
403 err_free_res:
404 	tegra194_cpufreq_free_resources();
405 put_bpmp:
406 	tegra_bpmp_put(bpmp);
407 	return err;
408 }
409 
410 static int tegra194_cpufreq_remove(struct platform_device *pdev)
411 {
412 	cpufreq_unregister_driver(&tegra194_cpufreq_driver);
413 	tegra194_cpufreq_free_resources();
414 
415 	return 0;
416 }
417 
/* Device tree compatible strings this driver binds to. */
static const struct of_device_id tegra194_cpufreq_of_match[] = {
	{ .compatible = "nvidia,tegra194-ccplex", },
	{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, tegra194_cpufreq_of_match);
423 
/*
 * Platform driver glue: matches devices through the OF table above and
 * hands them to the probe/remove callbacks of this file.
 */
static struct platform_driver tegra194_ccplex_driver = {
	.driver = {
		.name = "tegra194-cpufreq",
		.of_match_table = tegra194_cpufreq_of_match,
	},
	.probe = tegra194_cpufreq_probe,
	.remove = tegra194_cpufreq_remove,
};
module_platform_driver(tegra194_ccplex_driver);
433 
/* Module metadata. */
MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
MODULE_AUTHOR("Sumit Gupta <sumitg@nvidia.com>");
MODULE_DESCRIPTION("NVIDIA Tegra194 cpufreq driver");
MODULE_LICENSE("GPL v2");
438