1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved
4  */
5 
6 #include <linux/cpu.h>
7 #include <linux/cpufreq.h>
8 #include <linux/delay.h>
9 #include <linux/dma-mapping.h>
10 #include <linux/module.h>
11 #include <linux/of.h>
12 #include <linux/of_platform.h>
13 #include <linux/platform_device.h>
14 #include <linux/slab.h>
15 
16 #include <asm/smp_plat.h>
17 
18 #include <soc/tegra/bpmp.h>
19 #include <soc/tegra/bpmp-abi.h>
20 
/* Unit conversion factor (Hz <-> kHz, kHz <-> MHz). */
#define KHZ				1000
/* Rate of the fixed reference (PLLP) counter, in MHz. */
#define REF_CLK_MHZ			408
/* Counter observation window used for regular frequency queries, in us. */
#define US_DELAY			500
/* Short observation window used when sampling the boot frequency, in us. */
#define US_DELAY_MIN			2
/* Target spacing between adjacent frequency table entries, in Hz. */
#define CPUFREQ_TBL_STEP_HZ		(50 * KHZ * KHZ)
/* Largest value a 32-bit counter can hold. */
#define MAX_CNT				(~0U)

/* cpufreq transition latency, in nanoseconds */
#define TEGRA_CPUFREQ_TRANSITION_LATENCY	(300 * 1000)
30 
/*
 * CPU cluster identifiers. MAX_CLUSTERS is not a cluster itself; it is the
 * number of cluster IDs and is used to size the per-cluster table array.
 */
enum cluster {
	CLUSTER0 = 0,
	CLUSTER1 = 1,
	CLUSTER2 = 2,
	CLUSTER3 = 3,
	MAX_CLUSTERS = 4,
};
38 
39 struct tegra194_cpufreq_data {
40 	void __iomem *regs;
41 	size_t num_clusters;
42 	struct cpufreq_frequency_table **tables;
43 };
44 
45 struct tegra_cpu_ctr {
46 	u32 cpu;
47 	u32 delay;
48 	u32 coreclk_cnt, last_coreclk_cnt;
49 	u32 refclk_cnt, last_refclk_cnt;
50 };
51 
52 struct read_counters_work {
53 	struct work_struct work;
54 	struct tegra_cpu_ctr c;
55 };
56 
57 static struct workqueue_struct *read_counters_wq;
58 
59 static void get_cpu_cluster(void *cluster)
60 {
61 	u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
62 
63 	*((uint32_t *)cluster) = MPIDR_AFFINITY_LEVEL(mpidr, 1);
64 }
65 
66 /*
67  * Read per-core Read-only system register NVFREQ_FEEDBACK_EL1.
68  * The register provides frequency feedback information to
69  * determine the average actual frequency a core has run at over
70  * a period of time.
71  *	[31:0] PLLP counter: Counts at fixed frequency (408 MHz)
72  *	[63:32] Core clock counter: counts on every core clock cycle
73  *			where the core is architecturally clocking
74  */
75 static u64 read_freq_feedback(void)
76 {
77 	u64 val = 0;
78 
79 	asm volatile("mrs %0, s3_0_c15_c0_5" : "=r" (val) : );
80 
81 	return val;
82 }
83 
84 static inline u32 map_ndiv_to_freq(struct mrq_cpu_ndiv_limits_response
85 				   *nltbl, u16 ndiv)
86 {
87 	return nltbl->ref_clk_hz / KHZ * ndiv / (nltbl->pdiv * nltbl->mdiv);
88 }
89 
90 static void tegra_read_counters(struct work_struct *work)
91 {
92 	struct read_counters_work *read_counters_work;
93 	struct tegra_cpu_ctr *c;
94 	u64 val;
95 
96 	/*
97 	 * ref_clk_counter(32 bit counter) runs on constant clk,
98 	 * pll_p(408MHz).
99 	 * It will take = 2 ^ 32 / 408 MHz to overflow ref clk counter
100 	 *              = 10526880 usec = 10.527 sec to overflow
101 	 *
102 	 * Like wise core_clk_counter(32 bit counter) runs on core clock.
103 	 * It's synchronized to crab_clk (cpu_crab_clk) which runs at
104 	 * freq of cluster. Assuming max cluster clock ~2000MHz,
105 	 * It will take = 2 ^ 32 / 2000 MHz to overflow core clk counter
106 	 *              = ~2.147 sec to overflow
107 	 */
108 	read_counters_work = container_of(work, struct read_counters_work,
109 					  work);
110 	c = &read_counters_work->c;
111 
112 	val = read_freq_feedback();
113 	c->last_refclk_cnt = lower_32_bits(val);
114 	c->last_coreclk_cnt = upper_32_bits(val);
115 	udelay(c->delay);
116 	val = read_freq_feedback();
117 	c->refclk_cnt = lower_32_bits(val);
118 	c->coreclk_cnt = upper_32_bits(val);
119 }
120 
121 /*
122  * Return instantaneous cpu speed
123  * Instantaneous freq is calculated as -
124  * -Takes sample on every query of getting the freq.
125  *	- Read core and ref clock counters;
126  *	- Delay for X us
127  *	- Read above cycle counters again
128  *	- Calculates freq by subtracting current and previous counters
129  *	  divided by the delay time or eqv. of ref_clk_counter in delta time
130  *	- Return Kcycles/second, freq in KHz
131  *
132  *	delta time period = x sec
133  *			  = delta ref_clk_counter / (408 * 10^6) sec
134  *	freq in Hz = cycles/sec
135  *		   = (delta cycles / x sec
136  *		   = (delta cycles * 408 * 10^6) / delta ref_clk_counter
137  *	in KHz	   = (delta cycles * 408 * 10^3) / delta ref_clk_counter
138  *
139  * @cpu - logical cpu whose freq to be updated
140  * Returns freq in KHz on success, 0 if cpu is offline
141  */
142 static unsigned int tegra194_get_speed_common(u32 cpu, u32 delay)
143 {
144 	struct read_counters_work read_counters_work;
145 	struct tegra_cpu_ctr c;
146 	u32 delta_refcnt;
147 	u32 delta_ccnt;
148 	u32 rate_mhz;
149 
150 	/*
151 	 * udelay() is required to reconstruct cpu frequency over an
152 	 * observation window. Using workqueue to call udelay() with
153 	 * interrupts enabled.
154 	 */
155 	read_counters_work.c.cpu = cpu;
156 	read_counters_work.c.delay = delay;
157 	INIT_WORK_ONSTACK(&read_counters_work.work, tegra_read_counters);
158 	queue_work_on(cpu, read_counters_wq, &read_counters_work.work);
159 	flush_work(&read_counters_work.work);
160 	c = read_counters_work.c;
161 
162 	if (c.coreclk_cnt < c.last_coreclk_cnt)
163 		delta_ccnt = c.coreclk_cnt + (MAX_CNT - c.last_coreclk_cnt);
164 	else
165 		delta_ccnt = c.coreclk_cnt - c.last_coreclk_cnt;
166 	if (!delta_ccnt)
167 		return 0;
168 
169 	/* ref clock is 32 bits */
170 	if (c.refclk_cnt < c.last_refclk_cnt)
171 		delta_refcnt = c.refclk_cnt + (MAX_CNT - c.last_refclk_cnt);
172 	else
173 		delta_refcnt = c.refclk_cnt - c.last_refclk_cnt;
174 	if (!delta_refcnt) {
175 		pr_debug("cpufreq: %d is idle, delta_refcnt: 0\n", cpu);
176 		return 0;
177 	}
178 	rate_mhz = ((unsigned long)(delta_ccnt * REF_CLK_MHZ)) / delta_refcnt;
179 
180 	return (rate_mhz * KHZ); /* in KHz */
181 }
182 
183 static unsigned int tegra194_get_speed(u32 cpu)
184 {
185 	return tegra194_get_speed_common(cpu, US_DELAY);
186 }
187 
188 static int tegra194_cpufreq_init(struct cpufreq_policy *policy)
189 {
190 	struct tegra194_cpufreq_data *data = cpufreq_get_driver_data();
191 	u32 cpu;
192 	u32 cl;
193 
194 	smp_call_function_single(policy->cpu, get_cpu_cluster, &cl, true);
195 
196 	if (cl >= data->num_clusters)
197 		return -EINVAL;
198 
199 	/* boot freq */
200 	policy->cur = tegra194_get_speed_common(policy->cpu, US_DELAY_MIN);
201 
202 	/* set same policy for all cpus in a cluster */
203 	for (cpu = (cl * 2); cpu < ((cl + 1) * 2); cpu++)
204 		cpumask_set_cpu(cpu, policy->cpus);
205 
206 	policy->freq_table = data->tables[cl];
207 	policy->cpuinfo.transition_latency = TEGRA_CPUFREQ_TRANSITION_LATENCY;
208 
209 	return 0;
210 }
211 
212 static void set_cpu_ndiv(void *data)
213 {
214 	struct cpufreq_frequency_table *tbl = data;
215 	u64 ndiv_val = (u64)tbl->driver_data;
216 
217 	asm volatile("msr s3_0_c15_c0_4, %0" : : "r" (ndiv_val));
218 }
219 
220 static int tegra194_cpufreq_set_target(struct cpufreq_policy *policy,
221 				       unsigned int index)
222 {
223 	struct cpufreq_frequency_table *tbl = policy->freq_table + index;
224 
225 	/*
226 	 * Each core writes frequency in per core register. Then both cores
227 	 * in a cluster run at same frequency which is the maximum frequency
228 	 * request out of the values requested by both cores in that cluster.
229 	 */
230 	on_each_cpu_mask(policy->cpus, set_cpu_ndiv, tbl, true);
231 
232 	return 0;
233 }
234 
235 static struct cpufreq_driver tegra194_cpufreq_driver = {
236 	.name = "tegra194",
237 	.flags = CPUFREQ_STICKY | CPUFREQ_CONST_LOOPS |
238 		CPUFREQ_NEED_INITIAL_FREQ_CHECK,
239 	.verify = cpufreq_generic_frequency_table_verify,
240 	.target_index = tegra194_cpufreq_set_target,
241 	.get = tegra194_get_speed,
242 	.init = tegra194_cpufreq_init,
243 	.attr = cpufreq_generic_attr,
244 };
245 
246 static void tegra194_cpufreq_free_resources(void)
247 {
248 	destroy_workqueue(read_counters_wq);
249 }
250 
251 static struct cpufreq_frequency_table *
252 init_freq_table(struct platform_device *pdev, struct tegra_bpmp *bpmp,
253 		unsigned int cluster_id)
254 {
255 	struct cpufreq_frequency_table *freq_table;
256 	struct mrq_cpu_ndiv_limits_response resp;
257 	unsigned int num_freqs, ndiv, delta_ndiv;
258 	struct mrq_cpu_ndiv_limits_request req;
259 	struct tegra_bpmp_message msg;
260 	u16 freq_table_step_size;
261 	int err, index;
262 
263 	memset(&req, 0, sizeof(req));
264 	req.cluster_id = cluster_id;
265 
266 	memset(&msg, 0, sizeof(msg));
267 	msg.mrq = MRQ_CPU_NDIV_LIMITS;
268 	msg.tx.data = &req;
269 	msg.tx.size = sizeof(req);
270 	msg.rx.data = &resp;
271 	msg.rx.size = sizeof(resp);
272 
273 	err = tegra_bpmp_transfer(bpmp, &msg);
274 	if (err)
275 		return ERR_PTR(err);
276 
277 	/*
278 	 * Make sure frequency table step is a multiple of mdiv to match
279 	 * vhint table granularity.
280 	 */
281 	freq_table_step_size = resp.mdiv *
282 			DIV_ROUND_UP(CPUFREQ_TBL_STEP_HZ, resp.ref_clk_hz);
283 
284 	dev_dbg(&pdev->dev, "cluster %d: frequency table step size: %d\n",
285 		cluster_id, freq_table_step_size);
286 
287 	delta_ndiv = resp.ndiv_max - resp.ndiv_min;
288 
289 	if (unlikely(delta_ndiv == 0)) {
290 		num_freqs = 1;
291 	} else {
292 		/* We store both ndiv_min and ndiv_max hence the +1 */
293 		num_freqs = delta_ndiv / freq_table_step_size + 1;
294 	}
295 
296 	num_freqs += (delta_ndiv % freq_table_step_size) ? 1 : 0;
297 
298 	freq_table = devm_kcalloc(&pdev->dev, num_freqs + 1,
299 				  sizeof(*freq_table), GFP_KERNEL);
300 	if (!freq_table)
301 		return ERR_PTR(-ENOMEM);
302 
303 	for (index = 0, ndiv = resp.ndiv_min;
304 			ndiv < resp.ndiv_max;
305 			index++, ndiv += freq_table_step_size) {
306 		freq_table[index].driver_data = ndiv;
307 		freq_table[index].frequency = map_ndiv_to_freq(&resp, ndiv);
308 	}
309 
310 	freq_table[index].driver_data = resp.ndiv_max;
311 	freq_table[index++].frequency = map_ndiv_to_freq(&resp, resp.ndiv_max);
312 	freq_table[index].frequency = CPUFREQ_TABLE_END;
313 
314 	return freq_table;
315 }
316 
317 static int tegra194_cpufreq_probe(struct platform_device *pdev)
318 {
319 	struct tegra194_cpufreq_data *data;
320 	struct tegra_bpmp *bpmp;
321 	int err, i;
322 
323 	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
324 	if (!data)
325 		return -ENOMEM;
326 
327 	data->num_clusters = MAX_CLUSTERS;
328 	data->tables = devm_kcalloc(&pdev->dev, data->num_clusters,
329 				    sizeof(*data->tables), GFP_KERNEL);
330 	if (!data->tables)
331 		return -ENOMEM;
332 
333 	platform_set_drvdata(pdev, data);
334 
335 	bpmp = tegra_bpmp_get(&pdev->dev);
336 	if (IS_ERR(bpmp))
337 		return PTR_ERR(bpmp);
338 
339 	read_counters_wq = alloc_workqueue("read_counters_wq", __WQ_LEGACY, 1);
340 	if (!read_counters_wq) {
341 		dev_err(&pdev->dev, "fail to create_workqueue\n");
342 		err = -EINVAL;
343 		goto put_bpmp;
344 	}
345 
346 	for (i = 0; i < data->num_clusters; i++) {
347 		data->tables[i] = init_freq_table(pdev, bpmp, i);
348 		if (IS_ERR(data->tables[i])) {
349 			err = PTR_ERR(data->tables[i]);
350 			goto err_free_res;
351 		}
352 	}
353 
354 	tegra194_cpufreq_driver.driver_data = data;
355 
356 	err = cpufreq_register_driver(&tegra194_cpufreq_driver);
357 	if (!err)
358 		goto put_bpmp;
359 
360 err_free_res:
361 	tegra194_cpufreq_free_resources();
362 put_bpmp:
363 	tegra_bpmp_put(bpmp);
364 	return err;
365 }
366 
367 static int tegra194_cpufreq_remove(struct platform_device *pdev)
368 {
369 	cpufreq_unregister_driver(&tegra194_cpufreq_driver);
370 	tegra194_cpufreq_free_resources();
371 
372 	return 0;
373 }
374 
375 static const struct of_device_id tegra194_cpufreq_of_match[] = {
376 	{ .compatible = "nvidia,tegra194-ccplex", },
377 	{ /* sentinel */ }
378 };
379 MODULE_DEVICE_TABLE(of, tegra194_cpufreq_of_match);
380 
381 static struct platform_driver tegra194_ccplex_driver = {
382 	.driver = {
383 		.name = "tegra194-cpufreq",
384 		.of_match_table = tegra194_cpufreq_of_match,
385 	},
386 	.probe = tegra194_cpufreq_probe,
387 	.remove = tegra194_cpufreq_remove,
388 };
389 module_platform_driver(tegra194_ccplex_driver);
390 
391 MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
392 MODULE_AUTHOR("Sumit Gupta <sumitg@nvidia.com>");
393 MODULE_DESCRIPTION("NVIDIA Tegra194 cpufreq driver");
394 MODULE_LICENSE("GPL v2");
395