164f55e62SAndrey Grodzovsky /*
264f55e62SAndrey Grodzovsky * Copyright 2019 Advanced Micro Devices, Inc.
364f55e62SAndrey Grodzovsky *
464f55e62SAndrey Grodzovsky * Permission is hereby granted, free of charge, to any person obtaining a
564f55e62SAndrey Grodzovsky * copy of this software and associated documentation files (the "Software"),
664f55e62SAndrey Grodzovsky * to deal in the Software without restriction, including without limitation
764f55e62SAndrey Grodzovsky * the rights to use, copy, modify, merge, publish, distribute, sublicense,
864f55e62SAndrey Grodzovsky * and/or sell copies of the Software, and to permit persons to whom the
964f55e62SAndrey Grodzovsky * Software is furnished to do so, subject to the following conditions:
1064f55e62SAndrey Grodzovsky *
1164f55e62SAndrey Grodzovsky * The above copyright notice and this permission notice shall be included in
1264f55e62SAndrey Grodzovsky * all copies or substantial portions of the Software.
1364f55e62SAndrey Grodzovsky *
1464f55e62SAndrey Grodzovsky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1564f55e62SAndrey Grodzovsky * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1664f55e62SAndrey Grodzovsky * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
1764f55e62SAndrey Grodzovsky * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
1864f55e62SAndrey Grodzovsky * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
1964f55e62SAndrey Grodzovsky * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
2064f55e62SAndrey Grodzovsky * OTHER DEALINGS IN THE SOFTWARE.
2164f55e62SAndrey Grodzovsky *
2264f55e62SAndrey Grodzovsky */
2364f55e62SAndrey Grodzovsky
2464f55e62SAndrey Grodzovsky #include "amdgpu_ras_eeprom.h"
2564f55e62SAndrey Grodzovsky #include "amdgpu.h"
2664f55e62SAndrey Grodzovsky #include "amdgpu_ras.h"
2764f55e62SAndrey Grodzovsky #include <linux/bits.h>
28ef1caf48SJohn Clements #include "atom.h"
2924f55c05SAlex Deucher #include "amdgpu_eeprom.h"
3014fb496aSJohn Clements #include "amdgpu_atomfirmware.h"
31c65b0805SLuben Tuikov #include <linux/debugfs.h>
32c65b0805SLuben Tuikov #include <linux/uaccess.h>
3364f55e62SAndrey Grodzovsky
34d0fb18b5SAndrey Grodzovsky #include "amdgpu_reset.h"
35d0fb18b5SAndrey Grodzovsky
36da858deaSLuben Tuikov /* These are memory addresses as would be seen by one or more EEPROM
37da858deaSLuben Tuikov * chips strung on the I2C bus, usually by manipulating pins 1-3 of a
38da858deaSLuben Tuikov * set of EEPROM devices. They form a continuous memory space.
39da858deaSLuben Tuikov *
40da858deaSLuben Tuikov * The I2C device address includes the device type identifier, 1010b,
41da858deaSLuben Tuikov * which is a reserved value and indicates that this is an I2C EEPROM
42da858deaSLuben Tuikov * device. It also includes the top 3 bits of the 19 bit EEPROM memory
43da858deaSLuben Tuikov * address, namely bits 18, 17, and 16. This makes up the 7 bit
44da858deaSLuben Tuikov * address sent on the I2C bus with bit 0 being the direction bit,
45da858deaSLuben Tuikov * which is not represented here, and sent by the hardware directly.
46da858deaSLuben Tuikov *
47da858deaSLuben Tuikov * For instance,
48da858deaSLuben Tuikov * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0.
49da858deaSLuben Tuikov * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h.
50da858deaSLuben Tuikov * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h.
51da858deaSLuben Tuikov * Depending on the size of the I2C EEPROM device(s), bits 18:16 may
52da858deaSLuben Tuikov * address memory in a device or a device on the I2C bus, depending on
53da858deaSLuben Tuikov * the status of pins 1-3. See top of amdgpu_eeprom.c.
543b8164f8SLuben Tuikov *
553b8164f8SLuben Tuikov * The RAS table lives either at address 0 or address 40000h of EEPROM.
56da858deaSLuben Tuikov */
#define EEPROM_I2C_MADDR_0      0x0
#define EEPROM_I2C_MADDR_4      0x40000

/*
 * The two macros below give the actual number of bytes that the table
 * header and a single table record occupy in EEPROM memory.
 * RAS_TABLE_RECORD_SIZE differs from sizeof(eeprom_table_record), which
 * uses uint64 to hold 6-byte fields such as retired_page.
 */
#define RAS_TABLE_HEADER_SIZE   20
#define RAS_TABLE_RECORD_SIZE   24

/* Table header tag, 'AMDR' in ASCII */
#define RAS_TABLE_HDR_VAL       0x414d4452

/* Bad GPU tag, 'BADG' in ASCII */
#define RAS_TABLE_HDR_BAD       0x42414447
741d6a9d12SGuchun Chen
75e06da817SSrinivasan Shanmugam /*
767f599fedSStanley.Yang * EEPROM Table structure v1
777f599fedSStanley.Yang * ---------------------------------
787f599fedSStanley.Yang * | |
797f599fedSStanley.Yang * | EEPROM TABLE HEADER |
807f599fedSStanley.Yang * | ( size 20 Bytes ) |
817f599fedSStanley.Yang * | |
827f599fedSStanley.Yang * ---------------------------------
837f599fedSStanley.Yang * | |
847f599fedSStanley.Yang * | BAD PAGE RECORD AREA |
857f599fedSStanley.Yang * | |
867f599fedSStanley.Yang * ---------------------------------
877f599fedSStanley.Yang */
887f599fedSStanley.Yang
/*
 * v1 table layout: assume a 2-Mbit EEPROM and use all of it. The header
 * sits at the very start of the table and records follow it directly.
 */
#define RAS_TBL_SIZE_BYTES      (256 * 1024)
#define RAS_TABLE_START         0
#define RAS_HDR_START           RAS_TABLE_START
#define RAS_RECORD_START        (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
#define RAS_MAX_RECORD_COUNT    ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
				 / RAS_TABLE_RECORD_SIZE)
9664f55e62SAndrey Grodzovsky
97e06da817SSrinivasan Shanmugam /*
 * EEPROM Table structure v2.1
997f599fedSStanley.Yang * ---------------------------------
1007f599fedSStanley.Yang * | |
1017f599fedSStanley.Yang * | EEPROM TABLE HEADER |
1027f599fedSStanley.Yang * | ( size 20 Bytes ) |
1037f599fedSStanley.Yang * | |
1047f599fedSStanley.Yang * ---------------------------------
1057f599fedSStanley.Yang * | |
1067f599fedSStanley.Yang * | EEPROM TABLE RAS INFO |
1077f599fedSStanley.Yang * | (available info size 4 Bytes) |
1087f599fedSStanley.Yang * | ( reserved size 252 Bytes ) |
1097f599fedSStanley.Yang * | |
1107f599fedSStanley.Yang * ---------------------------------
1117f599fedSStanley.Yang * | |
1127f599fedSStanley.Yang * | BAD PAGE RECORD AREA |
1137f599fedSStanley.Yang * | |
1147f599fedSStanley.Yang * ---------------------------------
1157f599fedSStanley.Yang */
1167f599fedSStanley.Yang
/*
 * EEPROM Table V2_1: a fixed-size RAS info area is placed between the
 * table header and the first record.
 */
#define RAS_TABLE_V2_1_INFO_SIZE        256
#define RAS_TABLE_V2_1_INFO_START       RAS_TABLE_HEADER_SIZE
#define RAS_RECORD_START_V2_1           (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \
					 RAS_TABLE_V2_1_INFO_SIZE)
#define RAS_MAX_RECORD_COUNT_V2_1       ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \
					  RAS_TABLE_V2_1_INFO_SIZE) \
					 / RAS_TABLE_RECORD_SIZE)
12565183faeSStanley.Yang
/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
 * offset off of RAS_TABLE_START.  That is, this is something you can
 * add to control->i2c_address, and then tell I2C layer to read
 * from/write to there. _N is the so called absolute index,
 * because it starts right after the table header.
 */
#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \
				     (_N) * RAS_TABLE_RECORD_SIZE)

/* Inverse of RAS_INDEX_TO_OFFSET(): EEPROM offset to absolute index. */
#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
				      (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)

/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
 * of "fri", return the absolute record index off of the end of
 * the table header.
 */
#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
			      (_C)->ras_max_record_count)

/* Number of records implied by the tbl_size field of a v1 header. */
#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
				 RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)

/* Same as RAS_NUM_RECS(), but also discounts the v2.1 RAS info area. */
#define RAS_NUM_RECS_V2_1(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
				       RAS_TABLE_HEADER_SIZE - \
				       RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE)

/* Map an eeprom_control embedded in struct amdgpu_ras to that struct's
 * adev member.  The whole expansion is parenthesized so it behaves as a
 * single primary expression wherever it is used.
 */
#define to_amdgpu_device(x) ((container_of(x, struct amdgpu_ras, eeprom_control))->adev)
15364f55e62SAndrey Grodzovsky
__is_ras_eeprom_supported(struct amdgpu_device * adev)1544bfb7428SJohn Clements static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
1554bfb7428SJohn Clements {
156bc22f8ecSCandice Li switch (adev->ip_versions[MP1_HWIP][0]) {
1576246059aSAlex Deucher case IP_VERSION(11, 0, 2): /* VEGA20 and ARCTURUS */
1586246059aSAlex Deucher case IP_VERSION(11, 0, 7): /* Sienna cichlid */
159bc22f8ecSCandice Li case IP_VERSION(13, 0, 0):
1606246059aSAlex Deucher case IP_VERSION(13, 0, 2): /* Aldebaran */
161bc22f8ecSCandice Li case IP_VERSION(13, 0, 10):
162bc22f8ecSCandice Li return true;
1634b721ed8SCandice Li case IP_VERSION(13, 0, 6):
1644b721ed8SCandice Li return (adev->gmc.is_app_apu) ? false : true;
165bc22f8ecSCandice Li default:
166bc22f8ecSCandice Li return false;
167bc22f8ecSCandice Li }
168bc22f8ecSCandice Li }
169bc22f8ecSCandice Li
__get_eeprom_i2c_addr(struct amdgpu_device * adev,struct amdgpu_ras_eeprom_control * control)170ef1caf48SJohn Clements static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
171ccdfbfecSLuben Tuikov struct amdgpu_ras_eeprom_control *control)
172ef1caf48SJohn Clements {
17364a3dbb0SLuben Tuikov struct atom_context *atom_ctx = adev->mode_info.atom_context;
174cc947bf9SLuben Tuikov u8 i2c_addr;
175cc947bf9SLuben Tuikov
176ccdfbfecSLuben Tuikov if (!control)
177ef1caf48SJohn Clements return false;
178ef1caf48SJohn Clements
179cc947bf9SLuben Tuikov if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
180cc947bf9SLuben Tuikov /* The address given by VBIOS is an 8-bit, wire-format
181cc947bf9SLuben Tuikov * address, i.e. the most significant byte.
182cc947bf9SLuben Tuikov *
183cc947bf9SLuben Tuikov * Normalize it to a 19-bit EEPROM address. Remove the
184cc947bf9SLuben Tuikov * device type identifier and make it a 7-bit address;
185cc947bf9SLuben Tuikov * then make it a 19-bit EEPROM address. See top of
186cc947bf9SLuben Tuikov * amdgpu_eeprom.c.
187cc947bf9SLuben Tuikov */
188cc947bf9SLuben Tuikov i2c_addr = (i2c_addr & 0x0F) >> 1;
189cc947bf9SLuben Tuikov control->i2c_address = ((u32) i2c_addr) << 16;
19014fb496aSJohn Clements
19114fb496aSJohn Clements return true;
19214fb496aSJohn Clements }
19314fb496aSJohn Clements
1946246059aSAlex Deucher switch (adev->ip_versions[MP1_HWIP][0]) {
1956246059aSAlex Deucher case IP_VERSION(11, 0, 2):
1966246059aSAlex Deucher /* VEGA20 and ARCTURUS */
1976246059aSAlex Deucher if (adev->asic_type == CHIP_VEGA20)
1986246059aSAlex Deucher control->i2c_address = EEPROM_I2C_MADDR_0;
199adf64e21SMario Limonciello else if (strnstr(atom_ctx->vbios_pn,
2006246059aSAlex Deucher "D342",
201adf64e21SMario Limonciello sizeof(atom_ctx->vbios_pn)))
2026246059aSAlex Deucher control->i2c_address = EEPROM_I2C_MADDR_0;
2036246059aSAlex Deucher else
2046246059aSAlex Deucher control->i2c_address = EEPROM_I2C_MADDR_4;
2056246059aSAlex Deucher return true;
2066246059aSAlex Deucher case IP_VERSION(11, 0, 7):
2073b8164f8SLuben Tuikov control->i2c_address = EEPROM_I2C_MADDR_0;
2088782007bSLuben Tuikov return true;
2096246059aSAlex Deucher case IP_VERSION(13, 0, 2):
210adf64e21SMario Limonciello if (strnstr(atom_ctx->vbios_pn, "D673",
211adf64e21SMario Limonciello sizeof(atom_ctx->vbios_pn)))
21264a3dbb0SLuben Tuikov control->i2c_address = EEPROM_I2C_MADDR_4;
21364a3dbb0SLuben Tuikov else
2143b8164f8SLuben Tuikov control->i2c_address = EEPROM_I2C_MADDR_0;
2158782007bSLuben Tuikov return true;
2166246059aSAlex Deucher case IP_VERSION(13, 0, 0):
217*c4307207SCandice Li if (strnstr(atom_ctx->vbios_pn, "D707",
218*c4307207SCandice Li sizeof(atom_ctx->vbios_pn)))
219*c4307207SCandice Li control->i2c_address = EEPROM_I2C_MADDR_0;
220*c4307207SCandice Li else
221*c4307207SCandice Li control->i2c_address = EEPROM_I2C_MADDR_4;
222*c4307207SCandice Li return true;
223b81fde0dSCandice Li case IP_VERSION(13, 0, 6):
2246246059aSAlex Deucher case IP_VERSION(13, 0, 10):
2256246059aSAlex Deucher control->i2c_address = EEPROM_I2C_MADDR_4;
2266246059aSAlex Deucher return true;
227ef1caf48SJohn Clements default:
228ef1caf48SJohn Clements return false;
229ef1caf48SJohn Clements }
230ef1caf48SJohn Clements }
231ef1caf48SJohn Clements
23263d4c081SLuben Tuikov static void
__encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header * hdr,unsigned char * buf)23363d4c081SLuben Tuikov __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr,
234d7edde3dSLuben Tuikov unsigned char *buf)
23564f55e62SAndrey Grodzovsky {
23663d4c081SLuben Tuikov u32 *pp = (uint32_t *)buf;
23764f55e62SAndrey Grodzovsky
23864f55e62SAndrey Grodzovsky pp[0] = cpu_to_le32(hdr->header);
23964f55e62SAndrey Grodzovsky pp[1] = cpu_to_le32(hdr->version);
24064f55e62SAndrey Grodzovsky pp[2] = cpu_to_le32(hdr->first_rec_offset);
24164f55e62SAndrey Grodzovsky pp[3] = cpu_to_le32(hdr->tbl_size);
24264f55e62SAndrey Grodzovsky pp[4] = cpu_to_le32(hdr->checksum);
24364f55e62SAndrey Grodzovsky }
24464f55e62SAndrey Grodzovsky
24563d4c081SLuben Tuikov static void
__decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header * hdr,unsigned char * buf)24663d4c081SLuben Tuikov __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr,
247d7edde3dSLuben Tuikov unsigned char *buf)
24864f55e62SAndrey Grodzovsky {
249d7edde3dSLuben Tuikov u32 *pp = (uint32_t *)buf;
25064f55e62SAndrey Grodzovsky
25164f55e62SAndrey Grodzovsky hdr->header = le32_to_cpu(pp[0]);
25264f55e62SAndrey Grodzovsky hdr->version = le32_to_cpu(pp[1]);
25364f55e62SAndrey Grodzovsky hdr->first_rec_offset = le32_to_cpu(pp[2]);
25464f55e62SAndrey Grodzovsky hdr->tbl_size = le32_to_cpu(pp[3]);
25564f55e62SAndrey Grodzovsky hdr->checksum = le32_to_cpu(pp[4]);
25664f55e62SAndrey Grodzovsky }
25764f55e62SAndrey Grodzovsky
__write_table_header(struct amdgpu_ras_eeprom_control * control)25863d4c081SLuben Tuikov static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
25964f55e62SAndrey Grodzovsky {
26063d4c081SLuben Tuikov u8 buf[RAS_TABLE_HEADER_SIZE];
2619015d60cSAndrey Grodzovsky struct amdgpu_device *adev = to_amdgpu_device(control);
26263d4c081SLuben Tuikov int res;
26364f55e62SAndrey Grodzovsky
26463d4c081SLuben Tuikov memset(buf, 0, sizeof(buf));
265d7edde3dSLuben Tuikov __encode_table_header_to_buf(&control->tbl_hdr, buf);
2665985ebbeSJohn Clements
26740e7ed97SDennis Li /* i2c may be unstable in gpu reset */
268d0fb18b5SAndrey Grodzovsky down_read(&adev->reset_domain->sem);
2692f60dd50SLuben Tuikov res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
27063d4c081SLuben Tuikov control->i2c_address +
27163d4c081SLuben Tuikov control->ras_header_offset,
272d7edde3dSLuben Tuikov buf, RAS_TABLE_HEADER_SIZE);
273d0fb18b5SAndrey Grodzovsky up_read(&adev->reset_domain->sem);
27440e7ed97SDennis Li
27563d4c081SLuben Tuikov if (res < 0) {
27663d4c081SLuben Tuikov DRM_ERROR("Failed to write EEPROM table header:%d", res);
27763d4c081SLuben Tuikov } else if (res < RAS_TABLE_HEADER_SIZE) {
27863d4c081SLuben Tuikov DRM_ERROR("Short write:%d out of %d\n",
27963d4c081SLuben Tuikov res, RAS_TABLE_HEADER_SIZE);
28063d4c081SLuben Tuikov res = -EIO;
28163d4c081SLuben Tuikov } else {
28263d4c081SLuben Tuikov res = 0;
28363d4c081SLuben Tuikov }
28464f55e62SAndrey Grodzovsky
28563d4c081SLuben Tuikov return res;
28664f55e62SAndrey Grodzovsky }
28764f55e62SAndrey Grodzovsky
2887f599fedSStanley.Yang static void
__encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info * rai,unsigned char * buf)2897f599fedSStanley.Yang __encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
2907f599fedSStanley.Yang unsigned char *buf)
2917f599fedSStanley.Yang {
2927f599fedSStanley.Yang u32 *pp = (uint32_t *)buf;
2937f599fedSStanley.Yang u32 tmp;
2947f599fedSStanley.Yang
2957f599fedSStanley.Yang tmp = ((uint32_t)(rai->rma_status) & 0xFF) |
2967f599fedSStanley.Yang (((uint32_t)(rai->health_percent) << 8) & 0xFF00) |
2977f599fedSStanley.Yang (((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000);
2987f599fedSStanley.Yang pp[0] = cpu_to_le32(tmp);
2997f599fedSStanley.Yang }
3007f599fedSStanley.Yang
3017f599fedSStanley.Yang static void
__decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info * rai,unsigned char * buf)3027f599fedSStanley.Yang __decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
3037f599fedSStanley.Yang unsigned char *buf)
3047f599fedSStanley.Yang {
3057f599fedSStanley.Yang u32 *pp = (uint32_t *)buf;
3067f599fedSStanley.Yang u32 tmp;
3077f599fedSStanley.Yang
3087f599fedSStanley.Yang tmp = le32_to_cpu(pp[0]);
3097f599fedSStanley.Yang rai->rma_status = tmp & 0xFF;
3107f599fedSStanley.Yang rai->health_percent = (tmp >> 8) & 0xFF;
3117f599fedSStanley.Yang rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF;
3127f599fedSStanley.Yang }
3137f599fedSStanley.Yang
__write_table_ras_info(struct amdgpu_ras_eeprom_control * control)3147f599fedSStanley.Yang static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control)
3157f599fedSStanley.Yang {
3167f599fedSStanley.Yang struct amdgpu_device *adev = to_amdgpu_device(control);
3177f599fedSStanley.Yang u8 *buf;
3187f599fedSStanley.Yang int res;
3197f599fedSStanley.Yang
3207f599fedSStanley.Yang buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
3217f599fedSStanley.Yang if (!buf) {
3227f599fedSStanley.Yang DRM_ERROR("Failed to alloc buf to write table ras info\n");
3237f599fedSStanley.Yang return -ENOMEM;
3247f599fedSStanley.Yang }
3257f599fedSStanley.Yang
3267f599fedSStanley.Yang __encode_table_ras_info_to_buf(&control->tbl_rai, buf);
3277f599fedSStanley.Yang
3287f599fedSStanley.Yang /* i2c may be unstable in gpu reset */
3297f599fedSStanley.Yang down_read(&adev->reset_domain->sem);
3307f599fedSStanley.Yang res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
3317f599fedSStanley.Yang control->i2c_address +
3327f599fedSStanley.Yang control->ras_info_offset,
3337f599fedSStanley.Yang buf, RAS_TABLE_V2_1_INFO_SIZE);
3347f599fedSStanley.Yang up_read(&adev->reset_domain->sem);
3357f599fedSStanley.Yang
3367f599fedSStanley.Yang if (res < 0) {
3377f599fedSStanley.Yang DRM_ERROR("Failed to write EEPROM table ras info:%d", res);
3387f599fedSStanley.Yang } else if (res < RAS_TABLE_V2_1_INFO_SIZE) {
3397f599fedSStanley.Yang DRM_ERROR("Short write:%d out of %d\n",
3407f599fedSStanley.Yang res, RAS_TABLE_V2_1_INFO_SIZE);
3417f599fedSStanley.Yang res = -EIO;
3427f599fedSStanley.Yang } else {
3437f599fedSStanley.Yang res = 0;
3447f599fedSStanley.Yang }
3457f599fedSStanley.Yang
3467f599fedSStanley.Yang kfree(buf);
3477f599fedSStanley.Yang
3487f599fedSStanley.Yang return res;
3497f599fedSStanley.Yang }
3507f599fedSStanley.Yang
__calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control * control)351803c6ebdSLuben Tuikov static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control)
352db338e16SAndrey Grodzovsky {
35363d4c081SLuben Tuikov int ii;
35463d4c081SLuben Tuikov u8 *pp, csum;
355803c6ebdSLuben Tuikov size_t sz;
356db338e16SAndrey Grodzovsky
357db338e16SAndrey Grodzovsky /* Header checksum, skip checksum field in the calculation */
358803c6ebdSLuben Tuikov sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum);
35963d4c081SLuben Tuikov pp = (u8 *) &control->tbl_hdr;
36063d4c081SLuben Tuikov csum = 0;
36163d4c081SLuben Tuikov for (ii = 0; ii < sz; ii++, pp++)
36263d4c081SLuben Tuikov csum += *pp;
363db338e16SAndrey Grodzovsky
36463d4c081SLuben Tuikov return csum;
365db338e16SAndrey Grodzovsky }
36664f55e62SAndrey Grodzovsky
__calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control * control)3677c2551faSStanley.Yang static u8 __calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control *control)
3687c2551faSStanley.Yang {
3697c2551faSStanley.Yang int ii;
3707c2551faSStanley.Yang u8 *pp, csum;
3717c2551faSStanley.Yang size_t sz;
3727c2551faSStanley.Yang
3737c2551faSStanley.Yang sz = sizeof(control->tbl_rai);
3747c2551faSStanley.Yang pp = (u8 *) &control->tbl_rai;
3757c2551faSStanley.Yang csum = 0;
3767c2551faSStanley.Yang for (ii = 0; ii < sz; ii++, pp++)
3777c2551faSStanley.Yang csum += *pp;
3787c2551faSStanley.Yang
3797c2551faSStanley.Yang return csum;
3807c2551faSStanley.Yang }
3817c2551faSStanley.Yang
/*
 * Replace the tag in the table header with @header and write the header
 * back to EEPROM.
 *
 * The checksum is adjusted incrementally rather than recomputed: the
 * stored checksum is negated to recover the running byte sum, the four
 * bytes of the old tag are taken out, the four bytes of the new tag are
 * added, and the result is negated again. All arithmetic wraps at 8 bits.
 *
 * Return the result of __write_table_header().
 */
static int amdgpu_ras_eeprom_correct_header_tag(
	struct amdgpu_ras_eeprom_control *control,
	uint32_t header)
{
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	u8 *old_tag = (void *) &hdr->header;
	u8 *new_tag = (void *) &header;
	u8 csum;
	int res;

	csum = -hdr->checksum;
	csum -= (old_tag[0] + old_tag[1] + old_tag[2] + old_tag[3]);
	csum += new_tag[0] + new_tag[1] + new_tag[2] + new_tag[3];
	csum = -csum;

	/* Only the in-memory update and the EEPROM write are locked. */
	mutex_lock(&control->ras_tbl_mutex);
	hdr->header = header;
	hdr->checksum = csum;
	res = __write_table_header(control);
	mutex_unlock(&control->ras_tbl_mutex);

	return res;
}
4069b856defSGuchun Chen
40763d4c081SLuben Tuikov /**
40863d4c081SLuben Tuikov * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
40963d4c081SLuben Tuikov * @control: pointer to control structure
41063d4c081SLuben Tuikov *
41163d4c081SLuben Tuikov * Reset the contents of the header of the RAS EEPROM table.
41263d4c081SLuben Tuikov * Return 0 on success, -errno on error.
41363d4c081SLuben Tuikov */
amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control * control)414d01b400bSAndrey Grodzovsky int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
415d01b400bSAndrey Grodzovsky {
4168bbd4d83SStanley.Yang struct amdgpu_device *adev = to_amdgpu_device(control);
417d01b400bSAndrey Grodzovsky struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
4180bc3137bSStanley.Yang struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
41969691c82SStanley.Yang struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
42063d4c081SLuben Tuikov u8 csum;
42163d4c081SLuben Tuikov int res;
422db338e16SAndrey Grodzovsky
4230686627bSLuben Tuikov mutex_lock(&control->ras_tbl_mutex);
424d01b400bSAndrey Grodzovsky
425c28aa44dSLuben Tuikov hdr->header = RAS_TABLE_HDR_VAL;
4267f599fedSStanley.Yang if (adev->umc.ras &&
4277f599fedSStanley.Yang adev->umc.ras->set_eeprom_table_version)
4287f599fedSStanley.Yang adev->umc.ras->set_eeprom_table_version(hdr);
4297f599fedSStanley.Yang else
43071c79a19SStanley.Yang hdr->version = RAS_TABLE_VER_V1;
4317f599fedSStanley.Yang
4327f599fedSStanley.Yang if (hdr->version == RAS_TABLE_VER_V2_1) {
4337f599fedSStanley.Yang hdr->first_rec_offset = RAS_RECORD_START_V2_1;
4347f599fedSStanley.Yang hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
4357f599fedSStanley.Yang RAS_TABLE_V2_1_INFO_SIZE;
4360bc3137bSStanley.Yang rai->rma_status = GPU_HEALTH_USABLE;
4370bc3137bSStanley.Yang /**
4380bc3137bSStanley.Yang * GPU health represented as a percentage.
4390bc3137bSStanley.Yang * 0 means worst health, 100 means fully health.
4400bc3137bSStanley.Yang */
4410bc3137bSStanley.Yang rai->health_percent = 100;
4420bc3137bSStanley.Yang /* ecc_page_threshold = 0 means disable bad page retirement */
4430bc3137bSStanley.Yang rai->ecc_page_threshold = con->bad_page_cnt_threshold;
4447f599fedSStanley.Yang } else {
445c28aa44dSLuben Tuikov hdr->first_rec_offset = RAS_RECORD_START;
446c28aa44dSLuben Tuikov hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
4477f599fedSStanley.Yang }
448d01b400bSAndrey Grodzovsky
44963d4c081SLuben Tuikov csum = __calc_hdr_byte_sum(control);
4507c2551faSStanley.Yang if (hdr->version == RAS_TABLE_VER_V2_1)
4517c2551faSStanley.Yang csum += __calc_ras_info_byte_sum(control);
45263d4c081SLuben Tuikov csum = -csum;
45363d4c081SLuben Tuikov hdr->checksum = csum;
45463d4c081SLuben Tuikov res = __write_table_header(control);
4557f599fedSStanley.Yang if (!res && hdr->version > RAS_TABLE_VER_V1)
4567f599fedSStanley.Yang res = __write_table_ras_info(control);
45763d4c081SLuben Tuikov
45863d4c081SLuben Tuikov control->ras_num_recs = 0;
45963d4c081SLuben Tuikov control->ras_fri = 0;
460db338e16SAndrey Grodzovsky
4618bbd4d83SStanley.Yang amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
4628bbd4d83SStanley.Yang
46369691c82SStanley.Yang control->bad_channel_bitmap = 0;
46469691c82SStanley.Yang amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
46569691c82SStanley.Yang con->update_channel_flag = false;
46669691c82SStanley.Yang
467c65b0805SLuben Tuikov amdgpu_ras_debugfs_set_ret_size(control);
468c65b0805SLuben Tuikov
4690686627bSLuben Tuikov mutex_unlock(&control->ras_tbl_mutex);
470db338e16SAndrey Grodzovsky
47163d4c081SLuben Tuikov return res;
472d01b400bSAndrey Grodzovsky }
473d01b400bSAndrey Grodzovsky
47463d4c081SLuben Tuikov static void
__encode_table_record_to_buf(struct amdgpu_ras_eeprom_control * control,struct eeprom_table_record * record,unsigned char * buf)47563d4c081SLuben Tuikov __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control,
47664f55e62SAndrey Grodzovsky struct eeprom_table_record *record,
477d7edde3dSLuben Tuikov unsigned char *buf)
47864f55e62SAndrey Grodzovsky {
47964f55e62SAndrey Grodzovsky __le64 tmp = 0;
48064f55e62SAndrey Grodzovsky int i = 0;
48164f55e62SAndrey Grodzovsky
48264f55e62SAndrey Grodzovsky /* Next are all record fields according to EEPROM page spec in LE foramt */
483d7edde3dSLuben Tuikov buf[i++] = record->err_type;
48464f55e62SAndrey Grodzovsky
485d7edde3dSLuben Tuikov buf[i++] = record->bank;
48664f55e62SAndrey Grodzovsky
48764f55e62SAndrey Grodzovsky tmp = cpu_to_le64(record->ts);
488d7edde3dSLuben Tuikov memcpy(buf + i, &tmp, 8);
48964f55e62SAndrey Grodzovsky i += 8;
49064f55e62SAndrey Grodzovsky
49164f55e62SAndrey Grodzovsky tmp = cpu_to_le64((record->offset & 0xffffffffffff));
492d7edde3dSLuben Tuikov memcpy(buf + i, &tmp, 6);
49364f55e62SAndrey Grodzovsky i += 6;
49464f55e62SAndrey Grodzovsky
495d7edde3dSLuben Tuikov buf[i++] = record->mem_channel;
496d7edde3dSLuben Tuikov buf[i++] = record->mcumc_id;
49764f55e62SAndrey Grodzovsky
49864f55e62SAndrey Grodzovsky tmp = cpu_to_le64((record->retired_page & 0xffffffffffff));
499d7edde3dSLuben Tuikov memcpy(buf + i, &tmp, 6);
50064f55e62SAndrey Grodzovsky }
50164f55e62SAndrey Grodzovsky
50263d4c081SLuben Tuikov static void
__decode_table_record_from_buf(struct amdgpu_ras_eeprom_control * control,struct eeprom_table_record * record,unsigned char * buf)50363d4c081SLuben Tuikov __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control,
50464f55e62SAndrey Grodzovsky struct eeprom_table_record *record,
505d7edde3dSLuben Tuikov unsigned char *buf)
50664f55e62SAndrey Grodzovsky {
50764f55e62SAndrey Grodzovsky __le64 tmp = 0;
50864f55e62SAndrey Grodzovsky int i = 0;
50964f55e62SAndrey Grodzovsky
51064f55e62SAndrey Grodzovsky /* Next are all record fields according to EEPROM page spec in LE foramt */
511d7edde3dSLuben Tuikov record->err_type = buf[i++];
51264f55e62SAndrey Grodzovsky
513d7edde3dSLuben Tuikov record->bank = buf[i++];
51464f55e62SAndrey Grodzovsky
515d7edde3dSLuben Tuikov memcpy(&tmp, buf + i, 8);
51664f55e62SAndrey Grodzovsky record->ts = le64_to_cpu(tmp);
51764f55e62SAndrey Grodzovsky i += 8;
51864f55e62SAndrey Grodzovsky
519d7edde3dSLuben Tuikov memcpy(&tmp, buf + i, 6);
52064f55e62SAndrey Grodzovsky record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
52164f55e62SAndrey Grodzovsky i += 6;
52264f55e62SAndrey Grodzovsky
523d7edde3dSLuben Tuikov record->mem_channel = buf[i++];
524d7edde3dSLuben Tuikov record->mcumc_id = buf[i++];
52564f55e62SAndrey Grodzovsky
526d7edde3dSLuben Tuikov memcpy(&tmp, buf + i, 6);
52764f55e62SAndrey Grodzovsky record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff);
52864f55e62SAndrey Grodzovsky }
52964f55e62SAndrey Grodzovsky
amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device * adev)53011003c68SDennis Li bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
531e8fbaf03SGuchun Chen {
53211003c68SDennis Li struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
533e8fbaf03SGuchun Chen
53422106ed0STao Zhou if (!__is_ras_eeprom_supported(adev) ||
53522106ed0STao Zhou !amdgpu_bad_page_threshold)
53611003c68SDennis Li return false;
5374bfb7428SJohn Clements
538970fd197SStanley.Yang /* skip check eeprom table for VEGA20 Gaming */
539970fd197SStanley.Yang if (!con)
540970fd197SStanley.Yang return false;
541970fd197SStanley.Yang else
542970fd197SStanley.Yang if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC)))
543970fd197SStanley.Yang return false;
544970fd197SStanley.Yang
545c28aa44dSLuben Tuikov if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
54622106ed0STao Zhou if (amdgpu_bad_page_threshold == -1) {
54722106ed0STao Zhou dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
54822106ed0STao Zhou con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
54922106ed0STao Zhou dev_warn(adev->dev,
55022106ed0STao Zhou "But GPU can be operated due to bad_page_threshold = -1.\n");
55122106ed0STao Zhou return false;
55222106ed0STao Zhou } else {
553e8fbaf03SGuchun Chen dev_warn(adev->dev, "This GPU is in BAD status.");
55463d4c081SLuben Tuikov dev_warn(adev->dev, "Please retire it or set a larger "
555e8fbaf03SGuchun Chen "threshold value when reloading driver.\n");
55611003c68SDennis Li return true;
557e8fbaf03SGuchun Chen }
55822106ed0STao Zhou }
559e8fbaf03SGuchun Chen
56011003c68SDennis Li return false;
561e8fbaf03SGuchun Chen }
562e8fbaf03SGuchun Chen
56363d4c081SLuben Tuikov /**
56463d4c081SLuben Tuikov * __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM
56563d4c081SLuben Tuikov * @control: pointer to control structure
56663d4c081SLuben Tuikov * @buf: pointer to buffer containing data to write
56763d4c081SLuben Tuikov * @fri: start writing at this index
56863d4c081SLuben Tuikov * @num: number of records to write
56963d4c081SLuben Tuikov *
57063d4c081SLuben Tuikov * The caller must hold the table mutex in @control.
57163d4c081SLuben Tuikov * Return 0 on success, -errno otherwise.
57263d4c081SLuben Tuikov */
__amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control * control,u8 * buf,const u32 fri,const u32 num)57363d4c081SLuben Tuikov static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
57463d4c081SLuben Tuikov u8 *buf, const u32 fri, const u32 num)
57564f55e62SAndrey Grodzovsky {
57663d4c081SLuben Tuikov struct amdgpu_device *adev = to_amdgpu_device(control);
57763d4c081SLuben Tuikov u32 buf_size;
57863d4c081SLuben Tuikov int res;
57963d4c081SLuben Tuikov
58063d4c081SLuben Tuikov /* i2c may be unstable in gpu reset */
581d0fb18b5SAndrey Grodzovsky down_read(&adev->reset_domain->sem);
58263d4c081SLuben Tuikov buf_size = num * RAS_TABLE_RECORD_SIZE;
5832f60dd50SLuben Tuikov res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
58463d4c081SLuben Tuikov control->i2c_address +
58563d4c081SLuben Tuikov RAS_INDEX_TO_OFFSET(control, fri),
58663d4c081SLuben Tuikov buf, buf_size);
587d0fb18b5SAndrey Grodzovsky up_read(&adev->reset_domain->sem);
58863d4c081SLuben Tuikov if (res < 0) {
58963d4c081SLuben Tuikov DRM_ERROR("Writing %d EEPROM table records error:%d",
59063d4c081SLuben Tuikov num, res);
59163d4c081SLuben Tuikov } else if (res < buf_size) {
59263d4c081SLuben Tuikov /* Short write, return error.
59363d4c081SLuben Tuikov */
59463d4c081SLuben Tuikov DRM_ERROR("Wrote %d records out of %d",
59563d4c081SLuben Tuikov res / RAS_TABLE_RECORD_SIZE, num);
59663d4c081SLuben Tuikov res = -EIO;
59763d4c081SLuben Tuikov } else {
59863d4c081SLuben Tuikov res = 0;
59963d4c081SLuben Tuikov }
60063d4c081SLuben Tuikov
60163d4c081SLuben Tuikov return res;
60263d4c081SLuben Tuikov }
60363d4c081SLuben Tuikov
60463d4c081SLuben Tuikov static int
amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control * control,struct eeprom_table_record * record,const u32 num)60563d4c081SLuben Tuikov amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
60663d4c081SLuben Tuikov struct eeprom_table_record *record,
60763d4c081SLuben Tuikov const u32 num)
60863d4c081SLuben Tuikov {
60969691c82SStanley.Yang struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
61063d4c081SLuben Tuikov u32 a, b, i;
61163d4c081SLuben Tuikov u8 *buf, *pp;
61263d4c081SLuben Tuikov int res;
61363d4c081SLuben Tuikov
61463d4c081SLuben Tuikov buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
61563d4c081SLuben Tuikov if (!buf)
61663d4c081SLuben Tuikov return -ENOMEM;
61763d4c081SLuben Tuikov
61863d4c081SLuben Tuikov /* Encode all of them in one go.
61963d4c081SLuben Tuikov */
62063d4c081SLuben Tuikov pp = buf;
62169691c82SStanley.Yang for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
62263d4c081SLuben Tuikov __encode_table_record_to_buf(control, &record[i], pp);
62363d4c081SLuben Tuikov
62469691c82SStanley.Yang /* update bad channel bitmap */
62569691c82SStanley.Yang if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
62669691c82SStanley.Yang control->bad_channel_bitmap |= 1 << record[i].mem_channel;
62769691c82SStanley.Yang con->update_channel_flag = true;
62869691c82SStanley.Yang }
62969691c82SStanley.Yang }
63069691c82SStanley.Yang
63163d4c081SLuben Tuikov /* a, first record index to write into.
63263d4c081SLuben Tuikov * b, last record index to write into.
63363d4c081SLuben Tuikov * a = first index to read (fri) + number of records in the table,
63463d4c081SLuben Tuikov * b = a + @num - 1.
63563d4c081SLuben Tuikov * Let N = control->ras_max_num_record_count, then we have,
63663d4c081SLuben Tuikov * case 0: 0 <= a <= b < N,
63763d4c081SLuben Tuikov * just append @num records starting at a;
63863d4c081SLuben Tuikov * case 1: 0 <= a < N <= b,
63963d4c081SLuben Tuikov * append (N - a) records starting at a, and
64063d4c081SLuben Tuikov * append the remainder, b % N + 1, starting at 0.
64163d4c081SLuben Tuikov * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases,
64263d4c081SLuben Tuikov * case 2a: 0 <= a <= b < N
64363d4c081SLuben Tuikov * append num records starting at a; and fix fri if b overwrote it,
64463d4c081SLuben Tuikov * and since a <= b, if b overwrote it then a must've also,
64563d4c081SLuben Tuikov * and if b didn't overwrite it, then a didn't also.
64663d4c081SLuben Tuikov * case 2b: 0 <= b < a < N
64763d4c081SLuben Tuikov * write num records starting at a, which wraps around 0=N
64863d4c081SLuben Tuikov * and overwrite fri unconditionally. Now from case 2a,
64963d4c081SLuben Tuikov * this means that b eclipsed fri to overwrite it and wrap
65063d4c081SLuben Tuikov * around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally
65163d4c081SLuben Tuikov * set fri = b + 1 (mod N).
65263d4c081SLuben Tuikov * Now, since fri is updated in every case, except the trivial case 0,
65363d4c081SLuben Tuikov * the number of records present in the table after writing, is,
65463d4c081SLuben Tuikov * num_recs - 1 = b - fri (mod N), and we take the positive value,
65563d4c081SLuben Tuikov * by adding an arbitrary multiple of N before taking the modulo N
65663d4c081SLuben Tuikov * as shown below.
65763d4c081SLuben Tuikov */
65863d4c081SLuben Tuikov a = control->ras_fri + control->ras_num_recs;
65963d4c081SLuben Tuikov b = a + num - 1;
66063d4c081SLuben Tuikov if (b < control->ras_max_record_count) {
66163d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_write(control, buf, a, num);
66263d4c081SLuben Tuikov } else if (a < control->ras_max_record_count) {
66363d4c081SLuben Tuikov u32 g0, g1;
66463d4c081SLuben Tuikov
66563d4c081SLuben Tuikov g0 = control->ras_max_record_count - a;
66663d4c081SLuben Tuikov g1 = b % control->ras_max_record_count + 1;
66763d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
66863d4c081SLuben Tuikov if (res)
66963d4c081SLuben Tuikov goto Out;
67063d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_write(control,
67163d4c081SLuben Tuikov buf + g0 * RAS_TABLE_RECORD_SIZE,
67263d4c081SLuben Tuikov 0, g1);
67363d4c081SLuben Tuikov if (res)
67463d4c081SLuben Tuikov goto Out;
67563d4c081SLuben Tuikov if (g1 > control->ras_fri)
67663d4c081SLuben Tuikov control->ras_fri = g1 % control->ras_max_record_count;
67763d4c081SLuben Tuikov } else {
67863d4c081SLuben Tuikov a %= control->ras_max_record_count;
67963d4c081SLuben Tuikov b %= control->ras_max_record_count;
68063d4c081SLuben Tuikov
68163d4c081SLuben Tuikov if (a <= b) {
68263d4c081SLuben Tuikov /* Note that, b - a + 1 = num. */
68363d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_write(control, buf, a, num);
68463d4c081SLuben Tuikov if (res)
68563d4c081SLuben Tuikov goto Out;
68663d4c081SLuben Tuikov if (b >= control->ras_fri)
68763d4c081SLuben Tuikov control->ras_fri = (b + 1) % control->ras_max_record_count;
68863d4c081SLuben Tuikov } else {
68963d4c081SLuben Tuikov u32 g0, g1;
69063d4c081SLuben Tuikov
69163d4c081SLuben Tuikov /* b < a, which means, we write from
69263d4c081SLuben Tuikov * a to the end of the table, and from
69363d4c081SLuben Tuikov * the start of the table to b.
69463d4c081SLuben Tuikov */
69563d4c081SLuben Tuikov g0 = control->ras_max_record_count - a;
69663d4c081SLuben Tuikov g1 = b + 1;
69763d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
69863d4c081SLuben Tuikov if (res)
69963d4c081SLuben Tuikov goto Out;
70063d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_write(control,
70163d4c081SLuben Tuikov buf + g0 * RAS_TABLE_RECORD_SIZE,
70263d4c081SLuben Tuikov 0, g1);
70363d4c081SLuben Tuikov if (res)
70463d4c081SLuben Tuikov goto Out;
70563d4c081SLuben Tuikov control->ras_fri = g1 % control->ras_max_record_count;
70663d4c081SLuben Tuikov }
70763d4c081SLuben Tuikov }
70863d4c081SLuben Tuikov control->ras_num_recs = 1 + (control->ras_max_record_count + b
70963d4c081SLuben Tuikov - control->ras_fri)
71063d4c081SLuben Tuikov % control->ras_max_record_count;
71163d4c081SLuben Tuikov Out:
71263d4c081SLuben Tuikov kfree(buf);
71363d4c081SLuben Tuikov return res;
71463d4c081SLuben Tuikov }
71563d4c081SLuben Tuikov
71663d4c081SLuben Tuikov static int
amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control * control)71763d4c081SLuben Tuikov amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
71863d4c081SLuben Tuikov {
71964f55e62SAndrey Grodzovsky struct amdgpu_device *adev = to_amdgpu_device(control);
7209c06f91fSGuchun Chen struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
72163d4c081SLuben Tuikov u8 *buf, *pp, csum;
72263d4c081SLuben Tuikov u32 buf_size;
72363d4c081SLuben Tuikov int res;
72463d4c081SLuben Tuikov
72563d4c081SLuben Tuikov /* Modify the header if it exceeds.
72663d4c081SLuben Tuikov */
72763d4c081SLuben Tuikov if (amdgpu_bad_page_threshold != 0 &&
72863d4c081SLuben Tuikov control->ras_num_recs >= ras->bad_page_cnt_threshold) {
72963d4c081SLuben Tuikov dev_warn(adev->dev,
73063d4c081SLuben Tuikov "Saved bad pages %d reaches threshold value %d\n",
73163d4c081SLuben Tuikov control->ras_num_recs, ras->bad_page_cnt_threshold);
73263d4c081SLuben Tuikov control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
7330bc3137bSStanley.Yang if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
7340bc3137bSStanley.Yang control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
7350bc3137bSStanley.Yang control->tbl_rai.health_percent = 0;
7360bc3137bSStanley.Yang }
73763d4c081SLuben Tuikov }
73863d4c081SLuben Tuikov
7397f599fedSStanley.Yang if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
7407f599fedSStanley.Yang control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
7417f599fedSStanley.Yang RAS_TABLE_V2_1_INFO_SIZE +
7427f599fedSStanley.Yang control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
7437f599fedSStanley.Yang else
7447f599fedSStanley.Yang control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
7457f599fedSStanley.Yang control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
74663d4c081SLuben Tuikov control->tbl_hdr.checksum = 0;
74763d4c081SLuben Tuikov
74863d4c081SLuben Tuikov buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
74963d4c081SLuben Tuikov buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
75063d4c081SLuben Tuikov if (!buf) {
75163d4c081SLuben Tuikov DRM_ERROR("allocating memory for table of size %d bytes failed\n",
75263d4c081SLuben Tuikov control->tbl_hdr.tbl_size);
75363d4c081SLuben Tuikov res = -ENOMEM;
75463d4c081SLuben Tuikov goto Out;
75563d4c081SLuben Tuikov }
75663d4c081SLuben Tuikov
757d0fb18b5SAndrey Grodzovsky down_read(&adev->reset_domain->sem);
7582f60dd50SLuben Tuikov res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
75963d4c081SLuben Tuikov control->i2c_address +
76063d4c081SLuben Tuikov control->ras_record_offset,
76163d4c081SLuben Tuikov buf, buf_size);
762d0fb18b5SAndrey Grodzovsky up_read(&adev->reset_domain->sem);
76363d4c081SLuben Tuikov if (res < 0) {
76463d4c081SLuben Tuikov DRM_ERROR("EEPROM failed reading records:%d\n",
76563d4c081SLuben Tuikov res);
76663d4c081SLuben Tuikov goto Out;
76763d4c081SLuben Tuikov } else if (res < buf_size) {
76863d4c081SLuben Tuikov DRM_ERROR("EEPROM read %d out of %d bytes\n",
76963d4c081SLuben Tuikov res, buf_size);
77063d4c081SLuben Tuikov res = -EIO;
77163d4c081SLuben Tuikov goto Out;
77263d4c081SLuben Tuikov }
77363d4c081SLuben Tuikov
7740bc3137bSStanley.Yang /**
7750bc3137bSStanley.Yang * bad page records have been stored in eeprom,
7760bc3137bSStanley.Yang * now calculate gpu health percent
7770bc3137bSStanley.Yang */
7780bc3137bSStanley.Yang if (amdgpu_bad_page_threshold != 0 &&
7790bc3137bSStanley.Yang control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
7800bc3137bSStanley.Yang control->ras_num_recs < ras->bad_page_cnt_threshold)
7810bc3137bSStanley.Yang control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
7820bc3137bSStanley.Yang control->ras_num_recs) * 100) /
7830bc3137bSStanley.Yang ras->bad_page_cnt_threshold;
7840bc3137bSStanley.Yang
78563d4c081SLuben Tuikov /* Recalc the checksum.
78663d4c081SLuben Tuikov */
78763d4c081SLuben Tuikov csum = 0;
78863d4c081SLuben Tuikov for (pp = buf; pp < buf + buf_size; pp++)
78963d4c081SLuben Tuikov csum += *pp;
79063d4c081SLuben Tuikov
79163d4c081SLuben Tuikov csum += __calc_hdr_byte_sum(control);
7927c2551faSStanley.Yang if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
7937c2551faSStanley.Yang csum += __calc_ras_info_byte_sum(control);
79463d4c081SLuben Tuikov /* avoid sign extension when assigning to "checksum" */
79563d4c081SLuben Tuikov csum = -csum;
79663d4c081SLuben Tuikov control->tbl_hdr.checksum = csum;
79763d4c081SLuben Tuikov res = __write_table_header(control);
7987f599fedSStanley.Yang if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1)
7997f599fedSStanley.Yang res = __write_table_ras_info(control);
80063d4c081SLuben Tuikov Out:
80163d4c081SLuben Tuikov kfree(buf);
80263d4c081SLuben Tuikov return res;
80363d4c081SLuben Tuikov }
80463d4c081SLuben Tuikov
80563d4c081SLuben Tuikov /**
80663d4c081SLuben Tuikov * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
80763d4c081SLuben Tuikov * @control: pointer to control structure
80863d4c081SLuben Tuikov * @record: array of records to append
80963d4c081SLuben Tuikov * @num: number of records in @record array
81063d4c081SLuben Tuikov *
81163d4c081SLuben Tuikov * Append @num records to the table, calculate the checksum and write
81263d4c081SLuben Tuikov * the table back to EEPROM. The maximum number of records that
81363d4c081SLuben Tuikov * can be appended is between 1 and control->ras_max_record_count,
81463d4c081SLuben Tuikov * regardless of how many records are already stored in the table.
81563d4c081SLuben Tuikov *
81663d4c081SLuben Tuikov * Return 0 on success or if EEPROM is not supported, -errno on error.
81763d4c081SLuben Tuikov */
amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control * control,struct eeprom_table_record * record,const u32 num)81863d4c081SLuben Tuikov int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
81963d4c081SLuben Tuikov struct eeprom_table_record *record,
82063d4c081SLuben Tuikov const u32 num)
82163d4c081SLuben Tuikov {
82263d4c081SLuben Tuikov struct amdgpu_device *adev = to_amdgpu_device(control);
82363d4c081SLuben Tuikov int res;
82464f55e62SAndrey Grodzovsky
8254bfb7428SJohn Clements if (!__is_ras_eeprom_supported(adev))
82664f55e62SAndrey Grodzovsky return 0;
82764f55e62SAndrey Grodzovsky
82863d4c081SLuben Tuikov if (num == 0) {
82963d4c081SLuben Tuikov DRM_ERROR("will not append 0 records\n");
83063d4c081SLuben Tuikov return -EINVAL;
83163d4c081SLuben Tuikov } else if (num > control->ras_max_record_count) {
83263d4c081SLuben Tuikov DRM_ERROR("cannot append %d records than the size of table %d\n",
83363d4c081SLuben Tuikov num, control->ras_max_record_count);
83463d4c081SLuben Tuikov return -EINVAL;
83563d4c081SLuben Tuikov }
83664f55e62SAndrey Grodzovsky
8370686627bSLuben Tuikov mutex_lock(&control->ras_tbl_mutex);
83864f55e62SAndrey Grodzovsky
83963d4c081SLuben Tuikov res = amdgpu_ras_eeprom_append_table(control, record, num);
84063d4c081SLuben Tuikov if (!res)
84163d4c081SLuben Tuikov res = amdgpu_ras_eeprom_update_header(control);
842c65b0805SLuben Tuikov if (!res)
843c65b0805SLuben Tuikov amdgpu_ras_debugfs_set_ret_size(control);
84463d4c081SLuben Tuikov
84563d4c081SLuben Tuikov mutex_unlock(&control->ras_tbl_mutex);
84663d4c081SLuben Tuikov return res;
8479c06f91fSGuchun Chen }
8489c06f91fSGuchun Chen
84963d4c081SLuben Tuikov /**
85063d4c081SLuben Tuikov * __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer
85163d4c081SLuben Tuikov * @control: pointer to control structure
85263d4c081SLuben Tuikov * @buf: pointer to buffer to read into
85363d4c081SLuben Tuikov * @fri: first record index, start reading at this index, absolute index
85463d4c081SLuben Tuikov * @num: number of records to read
85563d4c081SLuben Tuikov *
85663d4c081SLuben Tuikov * The caller must hold the table mutex in @control.
85763d4c081SLuben Tuikov * Return 0 on success, -errno otherwise.
85864f55e62SAndrey Grodzovsky */
__amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control * control,u8 * buf,const u32 fri,const u32 num)85963d4c081SLuben Tuikov static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
86063d4c081SLuben Tuikov u8 *buf, const u32 fri, const u32 num)
86163d4c081SLuben Tuikov {
86263d4c081SLuben Tuikov struct amdgpu_device *adev = to_amdgpu_device(control);
86363d4c081SLuben Tuikov u32 buf_size;
86463d4c081SLuben Tuikov int res;
86564f55e62SAndrey Grodzovsky
86624f55c05SAlex Deucher /* i2c may be unstable in gpu reset */
867d0fb18b5SAndrey Grodzovsky down_read(&adev->reset_domain->sem);
86863d4c081SLuben Tuikov buf_size = num * RAS_TABLE_RECORD_SIZE;
8692f60dd50SLuben Tuikov res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
87063d4c081SLuben Tuikov control->i2c_address +
87163d4c081SLuben Tuikov RAS_INDEX_TO_OFFSET(control, fri),
87263d4c081SLuben Tuikov buf, buf_size);
873d0fb18b5SAndrey Grodzovsky up_read(&adev->reset_domain->sem);
87463d4c081SLuben Tuikov if (res < 0) {
87563d4c081SLuben Tuikov DRM_ERROR("Reading %d EEPROM table records error:%d",
87663d4c081SLuben Tuikov num, res);
87763d4c081SLuben Tuikov } else if (res < buf_size) {
87863d4c081SLuben Tuikov /* Short read, return error.
87964f55e62SAndrey Grodzovsky */
88063d4c081SLuben Tuikov DRM_ERROR("Read %d records out of %d",
88163d4c081SLuben Tuikov res / RAS_TABLE_RECORD_SIZE, num);
88263d4c081SLuben Tuikov res = -EIO;
88363d4c081SLuben Tuikov } else {
88463d4c081SLuben Tuikov res = 0;
88564f55e62SAndrey Grodzovsky }
88664f55e62SAndrey Grodzovsky
88763d4c081SLuben Tuikov return res;
88864f55e62SAndrey Grodzovsky }
88964f55e62SAndrey Grodzovsky
89063d4c081SLuben Tuikov /**
89163d4c081SLuben Tuikov * amdgpu_ras_eeprom_read -- read EEPROM
89263d4c081SLuben Tuikov * @control: pointer to control structure
89363d4c081SLuben Tuikov * @record: array of records to read into
89463d4c081SLuben Tuikov * @num: number of records in @record
89564f55e62SAndrey Grodzovsky *
89663d4c081SLuben Tuikov * Reads num records from the RAS table in EEPROM and
89763d4c081SLuben Tuikov * writes the data into @record array.
89863d4c081SLuben Tuikov *
89963d4c081SLuben Tuikov * Returns 0 on success, -errno on error.
90064f55e62SAndrey Grodzovsky */
amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control * control,struct eeprom_table_record * record,const u32 num)90163d4c081SLuben Tuikov int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
90263d4c081SLuben Tuikov struct eeprom_table_record *record,
90363d4c081SLuben Tuikov const u32 num)
90463d4c081SLuben Tuikov {
90563d4c081SLuben Tuikov struct amdgpu_device *adev = to_amdgpu_device(control);
90669691c82SStanley.Yang struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
90763d4c081SLuben Tuikov int i, res;
90863d4c081SLuben Tuikov u8 *buf, *pp;
90963d4c081SLuben Tuikov u32 g0, g1;
91064f55e62SAndrey Grodzovsky
91163d4c081SLuben Tuikov if (!__is_ras_eeprom_supported(adev))
91263d4c081SLuben Tuikov return 0;
91363d4c081SLuben Tuikov
91463d4c081SLuben Tuikov if (num == 0) {
91563d4c081SLuben Tuikov DRM_ERROR("will not read 0 records\n");
91663d4c081SLuben Tuikov return -EINVAL;
91763d4c081SLuben Tuikov } else if (num > control->ras_num_recs) {
91863d4c081SLuben Tuikov DRM_ERROR("too many records to read:%d available:%d\n",
91963d4c081SLuben Tuikov num, control->ras_num_recs);
92063d4c081SLuben Tuikov return -EINVAL;
92164f55e62SAndrey Grodzovsky }
92264f55e62SAndrey Grodzovsky
92363d4c081SLuben Tuikov buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
92463d4c081SLuben Tuikov if (!buf)
92563d4c081SLuben Tuikov return -ENOMEM;
92664f55e62SAndrey Grodzovsky
92763d4c081SLuben Tuikov /* Determine how many records to read, from the first record
92863d4c081SLuben Tuikov * index, fri, to the end of the table, and from the beginning
92963d4c081SLuben Tuikov * of the table, such that the total number of records is
93063d4c081SLuben Tuikov * @num, and we handle wrap around when fri > 0 and
93163d4c081SLuben Tuikov * fri + num > RAS_MAX_RECORD_COUNT.
93263d4c081SLuben Tuikov *
93363d4c081SLuben Tuikov * First we compute the index of the last element
93463d4c081SLuben Tuikov * which would be fetched from each region,
93563d4c081SLuben Tuikov * g0 is in [fri, fri + num - 1], and
93663d4c081SLuben Tuikov * g1 is in [0, RAS_MAX_RECORD_COUNT - 1].
93763d4c081SLuben Tuikov * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of
93863d4c081SLuben Tuikov * the last element to fetch, we set g0 to _the number_
93963d4c081SLuben Tuikov * of elements to fetch, @num, since we know that the last
94063d4c081SLuben Tuikov * indexed to be fetched does not exceed the table.
94163d4c081SLuben Tuikov *
94263d4c081SLuben Tuikov * If, however, g0 >= RAS_MAX_RECORD_COUNT, then
94363d4c081SLuben Tuikov * we set g0 to the number of elements to read
94463d4c081SLuben Tuikov * until the end of the table, and g1 to the number of
94563d4c081SLuben Tuikov * elements to read from the beginning of the table.
94663d4c081SLuben Tuikov */
94763d4c081SLuben Tuikov g0 = control->ras_fri + num - 1;
94863d4c081SLuben Tuikov g1 = g0 % control->ras_max_record_count;
94963d4c081SLuben Tuikov if (g0 < control->ras_max_record_count) {
95063d4c081SLuben Tuikov g0 = num;
95163d4c081SLuben Tuikov g1 = 0;
95263d4c081SLuben Tuikov } else {
95363d4c081SLuben Tuikov g0 = control->ras_max_record_count - control->ras_fri;
95463d4c081SLuben Tuikov g1 += 1;
95563d4c081SLuben Tuikov }
95663d4c081SLuben Tuikov
95763d4c081SLuben Tuikov mutex_lock(&control->ras_tbl_mutex);
95863d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_read(control, buf, control->ras_fri, g0);
95963d4c081SLuben Tuikov if (res)
96063d4c081SLuben Tuikov goto Out;
96163d4c081SLuben Tuikov if (g1) {
96263d4c081SLuben Tuikov res = __amdgpu_ras_eeprom_read(control,
96363d4c081SLuben Tuikov buf + g0 * RAS_TABLE_RECORD_SIZE,
96463d4c081SLuben Tuikov 0, g1);
96563d4c081SLuben Tuikov if (res)
96663d4c081SLuben Tuikov goto Out;
96763d4c081SLuben Tuikov }
96863d4c081SLuben Tuikov
96963d4c081SLuben Tuikov res = 0;
97063d4c081SLuben Tuikov
97163d4c081SLuben Tuikov /* Read up everything? Then transform.
97263d4c081SLuben Tuikov */
97363d4c081SLuben Tuikov pp = buf;
97469691c82SStanley.Yang for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
97563d4c081SLuben Tuikov __decode_table_record_from_buf(control, &record[i], pp);
97669691c82SStanley.Yang
97769691c82SStanley.Yang /* update bad channel bitmap */
97869691c82SStanley.Yang if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
97969691c82SStanley.Yang control->bad_channel_bitmap |= 1 << record[i].mem_channel;
98069691c82SStanley.Yang con->update_channel_flag = true;
98169691c82SStanley.Yang }
98269691c82SStanley.Yang }
98363d4c081SLuben Tuikov Out:
98463d4c081SLuben Tuikov kfree(buf);
9850686627bSLuben Tuikov mutex_unlock(&control->ras_tbl_mutex);
98664f55e62SAndrey Grodzovsky
98763d4c081SLuben Tuikov return res;
9881fab841fSLuben Tuikov }
9891fab841fSLuben Tuikov
amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control * control)9907f599fedSStanley.Yang uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
991c84d4670SGuchun Chen {
9927f599fedSStanley.Yang if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
9937f599fedSStanley.Yang return RAS_MAX_RECORD_COUNT_V2_1;
9947f599fedSStanley.Yang else
995e4e6a589SLuben Tuikov return RAS_MAX_RECORD_COUNT;
996c84d4670SGuchun Chen }
99763d4c081SLuben Tuikov
998c65b0805SLuben Tuikov static ssize_t
amdgpu_ras_debugfs_eeprom_size_read(struct file * f,char __user * buf,size_t size,loff_t * pos)999c65b0805SLuben Tuikov amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
1000c65b0805SLuben Tuikov size_t size, loff_t *pos)
1001c65b0805SLuben Tuikov {
1002c65b0805SLuben Tuikov struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
1003c65b0805SLuben Tuikov struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1004c65b0805SLuben Tuikov struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
1005c65b0805SLuben Tuikov u8 data[50];
1006c65b0805SLuben Tuikov int res;
1007c65b0805SLuben Tuikov
1008c65b0805SLuben Tuikov if (!size)
1009c65b0805SLuben Tuikov return size;
1010c65b0805SLuben Tuikov
1011c65b0805SLuben Tuikov if (!ras || !control) {
1012c65b0805SLuben Tuikov res = snprintf(data, sizeof(data), "Not supported\n");
1013c65b0805SLuben Tuikov } else {
1014c65b0805SLuben Tuikov res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
1015c65b0805SLuben Tuikov RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
1016c65b0805SLuben Tuikov }
1017c65b0805SLuben Tuikov
1018c65b0805SLuben Tuikov if (*pos >= res)
1019c65b0805SLuben Tuikov return 0;
1020c65b0805SLuben Tuikov
1021c65b0805SLuben Tuikov res -= *pos;
1022c65b0805SLuben Tuikov res = min_t(size_t, res, size);
1023c65b0805SLuben Tuikov
1024c65b0805SLuben Tuikov if (copy_to_user(buf, &data[*pos], res))
102564598e23SDan Carpenter return -EFAULT;
1026c65b0805SLuben Tuikov
1027c65b0805SLuben Tuikov *pos += res;
1028c65b0805SLuben Tuikov
1029c65b0805SLuben Tuikov return res;
1030c65b0805SLuben Tuikov }
1031c65b0805SLuben Tuikov
1032c65b0805SLuben Tuikov const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
1033c65b0805SLuben Tuikov .owner = THIS_MODULE,
1034c65b0805SLuben Tuikov .read = amdgpu_ras_debugfs_eeprom_size_read,
1035c65b0805SLuben Tuikov .write = NULL,
1036c65b0805SLuben Tuikov .llseek = default_llseek,
1037c65b0805SLuben Tuikov };
1038c65b0805SLuben Tuikov
1039c65b0805SLuben Tuikov static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n";
1040c65b0805SLuben Tuikov static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
1041c65b0805SLuben Tuikov #define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
1042c65b0805SLuben Tuikov static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n";
1043c65b0805SLuben Tuikov static const char *rec_hdr_fmt = "%5d 0x%05X %7s 0x%02X 0x%016llX 0x%012llX 0x%02X 0x%02X 0x%012llX\n";
1044c65b0805SLuben Tuikov #define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)
1045c65b0805SLuben Tuikov
1046c65b0805SLuben Tuikov static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
1047c65b0805SLuben Tuikov "ignore",
1048c65b0805SLuben Tuikov "re",
1049c65b0805SLuben Tuikov "ue",
1050c65b0805SLuben Tuikov };
1051c65b0805SLuben Tuikov
amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control * control)1052c65b0805SLuben Tuikov static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
1053c65b0805SLuben Tuikov {
1054c65b0805SLuben Tuikov return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
1055c65b0805SLuben Tuikov strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
1056c65b0805SLuben Tuikov }
1057c65b0805SLuben Tuikov
amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control * control)1058c65b0805SLuben Tuikov void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
1059c65b0805SLuben Tuikov {
1060c65b0805SLuben Tuikov struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
1061c65b0805SLuben Tuikov eeprom_control);
1062c65b0805SLuben Tuikov struct dentry *de = ras->de_ras_eeprom_table;
1063c65b0805SLuben Tuikov
1064c65b0805SLuben Tuikov if (de)
1065c65b0805SLuben Tuikov d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
1066c65b0805SLuben Tuikov }
1067c65b0805SLuben Tuikov
amdgpu_ras_debugfs_table_read(struct file * f,char __user * buf,size_t size,loff_t * pos)1068c65b0805SLuben Tuikov static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
1069c65b0805SLuben Tuikov size_t size, loff_t *pos)
1070c65b0805SLuben Tuikov {
1071c65b0805SLuben Tuikov struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
1072c65b0805SLuben Tuikov struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1073c65b0805SLuben Tuikov struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
1074c65b0805SLuben Tuikov const size_t orig_size = size;
1075b8badd50SDan Carpenter int res = -EFAULT;
1076c65b0805SLuben Tuikov size_t data_len;
1077c65b0805SLuben Tuikov
1078c65b0805SLuben Tuikov mutex_lock(&control->ras_tbl_mutex);
1079c65b0805SLuben Tuikov
1080c65b0805SLuben Tuikov /* We want *pos - data_len > 0, which means there's
1081c65b0805SLuben Tuikov * bytes to be printed from data.
1082c65b0805SLuben Tuikov */
1083c65b0805SLuben Tuikov data_len = strlen(tbl_hdr_str);
1084c65b0805SLuben Tuikov if (*pos < data_len) {
1085c65b0805SLuben Tuikov data_len -= *pos;
1086c65b0805SLuben Tuikov data_len = min_t(size_t, data_len, size);
1087c65b0805SLuben Tuikov if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
1088c65b0805SLuben Tuikov goto Out;
1089c65b0805SLuben Tuikov buf += data_len;
1090c65b0805SLuben Tuikov size -= data_len;
1091c65b0805SLuben Tuikov *pos += data_len;
1092c65b0805SLuben Tuikov }
1093c65b0805SLuben Tuikov
1094c65b0805SLuben Tuikov data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
1095c65b0805SLuben Tuikov if (*pos < data_len && size > 0) {
1096c65b0805SLuben Tuikov u8 data[tbl_hdr_fmt_size + 1];
1097c65b0805SLuben Tuikov loff_t lpos;
1098c65b0805SLuben Tuikov
1099c65b0805SLuben Tuikov snprintf(data, sizeof(data), tbl_hdr_fmt,
1100c65b0805SLuben Tuikov control->tbl_hdr.header,
1101c65b0805SLuben Tuikov control->tbl_hdr.version,
1102c65b0805SLuben Tuikov control->tbl_hdr.first_rec_offset,
1103c65b0805SLuben Tuikov control->tbl_hdr.tbl_size,
1104c65b0805SLuben Tuikov control->tbl_hdr.checksum);
1105c65b0805SLuben Tuikov
1106c65b0805SLuben Tuikov data_len -= *pos;
1107c65b0805SLuben Tuikov data_len = min_t(size_t, data_len, size);
1108c65b0805SLuben Tuikov lpos = *pos - strlen(tbl_hdr_str);
1109c65b0805SLuben Tuikov if (copy_to_user(buf, &data[lpos], data_len))
1110c65b0805SLuben Tuikov goto Out;
1111c65b0805SLuben Tuikov buf += data_len;
1112c65b0805SLuben Tuikov size -= data_len;
1113c65b0805SLuben Tuikov *pos += data_len;
1114c65b0805SLuben Tuikov }
1115c65b0805SLuben Tuikov
1116c65b0805SLuben Tuikov data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
1117c65b0805SLuben Tuikov if (*pos < data_len && size > 0) {
1118c65b0805SLuben Tuikov loff_t lpos;
1119c65b0805SLuben Tuikov
1120c65b0805SLuben Tuikov data_len -= *pos;
1121c65b0805SLuben Tuikov data_len = min_t(size_t, data_len, size);
1122c65b0805SLuben Tuikov lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
1123c65b0805SLuben Tuikov if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
1124c65b0805SLuben Tuikov goto Out;
1125c65b0805SLuben Tuikov buf += data_len;
1126c65b0805SLuben Tuikov size -= data_len;
1127c65b0805SLuben Tuikov *pos += data_len;
1128c65b0805SLuben Tuikov }
1129c65b0805SLuben Tuikov
1130c65b0805SLuben Tuikov data_len = amdgpu_ras_debugfs_table_size(control);
1131c65b0805SLuben Tuikov if (*pos < data_len && size > 0) {
1132c65b0805SLuben Tuikov u8 dare[RAS_TABLE_RECORD_SIZE];
1133c65b0805SLuben Tuikov u8 data[rec_hdr_fmt_size + 1];
1134d456f387SAlex Deucher struct eeprom_table_record record;
1135d456f387SAlex Deucher int s, r;
1136d456f387SAlex Deucher
1137c65b0805SLuben Tuikov /* Find the starting record index
1138c65b0805SLuben Tuikov */
1139d456f387SAlex Deucher s = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
1140d456f387SAlex Deucher strlen(rec_hdr_str);
1141d456f387SAlex Deucher s = s / rec_hdr_fmt_size;
1142d456f387SAlex Deucher r = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
1143d456f387SAlex Deucher strlen(rec_hdr_str);
1144d456f387SAlex Deucher r = r % rec_hdr_fmt_size;
1145c65b0805SLuben Tuikov
1146c65b0805SLuben Tuikov for ( ; size > 0 && s < control->ras_num_recs; s++) {
1147c65b0805SLuben Tuikov u32 ai = RAS_RI_TO_AI(control, s);
1148c65b0805SLuben Tuikov /* Read a single record
1149c65b0805SLuben Tuikov */
1150c65b0805SLuben Tuikov res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
1151c65b0805SLuben Tuikov if (res)
1152c65b0805SLuben Tuikov goto Out;
1153c65b0805SLuben Tuikov __decode_table_record_from_buf(control, &record, dare);
1154c65b0805SLuben Tuikov snprintf(data, sizeof(data), rec_hdr_fmt,
1155c65b0805SLuben Tuikov s,
1156c65b0805SLuben Tuikov RAS_INDEX_TO_OFFSET(control, ai),
1157c65b0805SLuben Tuikov record_err_type_str[record.err_type],
1158c65b0805SLuben Tuikov record.bank,
1159c65b0805SLuben Tuikov record.ts,
1160c65b0805SLuben Tuikov record.offset,
1161c65b0805SLuben Tuikov record.mem_channel,
1162c65b0805SLuben Tuikov record.mcumc_id,
1163c65b0805SLuben Tuikov record.retired_page);
1164c65b0805SLuben Tuikov
1165c65b0805SLuben Tuikov data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
1166b8badd50SDan Carpenter if (copy_to_user(buf, &data[r], data_len)) {
1167b8badd50SDan Carpenter res = -EFAULT;
1168b8badd50SDan Carpenter goto Out;
1169b8badd50SDan Carpenter }
1170c65b0805SLuben Tuikov buf += data_len;
1171c65b0805SLuben Tuikov size -= data_len;
1172c65b0805SLuben Tuikov *pos += data_len;
1173c65b0805SLuben Tuikov r = 0;
1174c65b0805SLuben Tuikov }
1175c65b0805SLuben Tuikov }
1176c65b0805SLuben Tuikov res = 0;
1177c65b0805SLuben Tuikov Out:
1178c65b0805SLuben Tuikov mutex_unlock(&control->ras_tbl_mutex);
1179c65b0805SLuben Tuikov return res < 0 ? res : orig_size - size;
1180c65b0805SLuben Tuikov }
1181c65b0805SLuben Tuikov
1182c65b0805SLuben Tuikov static ssize_t
amdgpu_ras_debugfs_eeprom_table_read(struct file * f,char __user * buf,size_t size,loff_t * pos)1183c65b0805SLuben Tuikov amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
1184c65b0805SLuben Tuikov size_t size, loff_t *pos)
1185c65b0805SLuben Tuikov {
1186c65b0805SLuben Tuikov struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
1187c65b0805SLuben Tuikov struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1188c65b0805SLuben Tuikov struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
1189c65b0805SLuben Tuikov u8 data[81];
1190c65b0805SLuben Tuikov int res;
1191c65b0805SLuben Tuikov
1192c65b0805SLuben Tuikov if (!size)
1193c65b0805SLuben Tuikov return size;
1194c65b0805SLuben Tuikov
1195c65b0805SLuben Tuikov if (!ras || !control) {
1196c65b0805SLuben Tuikov res = snprintf(data, sizeof(data), "Not supported\n");
1197c65b0805SLuben Tuikov if (*pos >= res)
1198c65b0805SLuben Tuikov return 0;
1199c65b0805SLuben Tuikov
1200c65b0805SLuben Tuikov res -= *pos;
1201c65b0805SLuben Tuikov res = min_t(size_t, res, size);
1202c65b0805SLuben Tuikov
1203c65b0805SLuben Tuikov if (copy_to_user(buf, &data[*pos], res))
120464598e23SDan Carpenter return -EFAULT;
1205c65b0805SLuben Tuikov
1206c65b0805SLuben Tuikov *pos += res;
1207c65b0805SLuben Tuikov
1208c65b0805SLuben Tuikov return res;
1209c65b0805SLuben Tuikov } else {
1210c65b0805SLuben Tuikov return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
1211c65b0805SLuben Tuikov }
1212c65b0805SLuben Tuikov }
1213c65b0805SLuben Tuikov
1214c65b0805SLuben Tuikov const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
1215c65b0805SLuben Tuikov .owner = THIS_MODULE,
1216c65b0805SLuben Tuikov .read = amdgpu_ras_debugfs_eeprom_table_read,
1217c65b0805SLuben Tuikov .write = NULL,
1218c65b0805SLuben Tuikov .llseek = default_llseek,
1219c65b0805SLuben Tuikov };
1220c65b0805SLuben Tuikov
122163d4c081SLuben Tuikov /**
122263d4c081SLuben Tuikov * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
122363d4c081SLuben Tuikov * @control: pointer to control structure
122463d4c081SLuben Tuikov *
122563d4c081SLuben Tuikov * Check the checksum of the stored in EEPROM RAS table.
122663d4c081SLuben Tuikov *
122763d4c081SLuben Tuikov * Return 0 if the checksum is correct,
122863d4c081SLuben Tuikov * positive if it is not correct, and
122963d4c081SLuben Tuikov * -errno on I/O error.
123063d4c081SLuben Tuikov */
__verify_ras_table_checksum(struct amdgpu_ras_eeprom_control * control)123163d4c081SLuben Tuikov static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control)
123263d4c081SLuben Tuikov {
123363d4c081SLuben Tuikov struct amdgpu_device *adev = to_amdgpu_device(control);
12343006c924SDan Carpenter int buf_size, res;
123563d4c081SLuben Tuikov u8 csum, *buf, *pp;
123663d4c081SLuben Tuikov
12377f599fedSStanley.Yang if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
12387f599fedSStanley.Yang buf_size = RAS_TABLE_HEADER_SIZE +
12397f599fedSStanley.Yang RAS_TABLE_V2_1_INFO_SIZE +
12407f599fedSStanley.Yang control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
12417f599fedSStanley.Yang else
124263d4c081SLuben Tuikov buf_size = RAS_TABLE_HEADER_SIZE +
124363d4c081SLuben Tuikov control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
12447f599fedSStanley.Yang
124563d4c081SLuben Tuikov buf = kzalloc(buf_size, GFP_KERNEL);
124663d4c081SLuben Tuikov if (!buf) {
124763d4c081SLuben Tuikov DRM_ERROR("Out of memory checking RAS table checksum.\n");
124863d4c081SLuben Tuikov return -ENOMEM;
124963d4c081SLuben Tuikov }
125063d4c081SLuben Tuikov
12512f60dd50SLuben Tuikov res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
125263d4c081SLuben Tuikov control->i2c_address +
125363d4c081SLuben Tuikov control->ras_header_offset,
125463d4c081SLuben Tuikov buf, buf_size);
125563d4c081SLuben Tuikov if (res < buf_size) {
125663d4c081SLuben Tuikov DRM_ERROR("Partial read for checksum, res:%d\n", res);
125763d4c081SLuben Tuikov /* On partial reads, return -EIO.
125863d4c081SLuben Tuikov */
125963d4c081SLuben Tuikov if (res >= 0)
126063d4c081SLuben Tuikov res = -EIO;
126163d4c081SLuben Tuikov goto Out;
126263d4c081SLuben Tuikov }
126363d4c081SLuben Tuikov
126463d4c081SLuben Tuikov csum = 0;
126563d4c081SLuben Tuikov for (pp = buf; pp < buf + buf_size; pp++)
126663d4c081SLuben Tuikov csum += *pp;
126763d4c081SLuben Tuikov Out:
126863d4c081SLuben Tuikov kfree(buf);
126963d4c081SLuben Tuikov return res < 0 ? res : csum;
127063d4c081SLuben Tuikov }
127163d4c081SLuben Tuikov
__read_table_ras_info(struct amdgpu_ras_eeprom_control * control)12727f599fedSStanley.Yang static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
12737f599fedSStanley.Yang {
12747f599fedSStanley.Yang struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
12757f599fedSStanley.Yang struct amdgpu_device *adev = to_amdgpu_device(control);
12767f599fedSStanley.Yang unsigned char *buf;
12777f599fedSStanley.Yang int res;
12787f599fedSStanley.Yang
12797f599fedSStanley.Yang buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
12807f599fedSStanley.Yang if (!buf) {
12817f599fedSStanley.Yang DRM_ERROR("Failed to alloc buf to read EEPROM table ras info\n");
12827f599fedSStanley.Yang return -ENOMEM;
12837f599fedSStanley.Yang }
12847f599fedSStanley.Yang
12857f599fedSStanley.Yang /**
12867f599fedSStanley.Yang * EEPROM table V2_1 supports ras info,
12877f599fedSStanley.Yang * read EEPROM table ras info
12887f599fedSStanley.Yang */
12897f599fedSStanley.Yang res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
12907f599fedSStanley.Yang control->i2c_address + control->ras_info_offset,
12917f599fedSStanley.Yang buf, RAS_TABLE_V2_1_INFO_SIZE);
12927f599fedSStanley.Yang if (res < RAS_TABLE_V2_1_INFO_SIZE) {
12937f599fedSStanley.Yang DRM_ERROR("Failed to read EEPROM table ras info, res:%d", res);
12947f599fedSStanley.Yang res = res >= 0 ? -EIO : res;
12957f599fedSStanley.Yang goto Out;
12967f599fedSStanley.Yang }
12977f599fedSStanley.Yang
12987f599fedSStanley.Yang __decode_table_ras_info_from_buf(rai, buf);
12997f599fedSStanley.Yang
13007f599fedSStanley.Yang Out:
13017f599fedSStanley.Yang kfree(buf);
13027f599fedSStanley.Yang return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
13037f599fedSStanley.Yang }
13047f599fedSStanley.Yang
/**
 * amdgpu_ras_eeprom_init -- read the RAS EEPROM table header and set up control
 * @control: pointer to control structure
 * @exceed_err_limit: cleared on entry; set to true only when the table is
 *                    tagged RAS_TABLE_HDR_BAD, the bad page threshold is not
 *                    above the stored record count, and
 *                    amdgpu_bad_page_threshold is not -1
 *
 * Reads and decodes the table header, derives the record count, record
 * offset, maximum record count and first record index for the table
 * version found, and then acts on the header tag:
 *
 *  - RAS_TABLE_HDR_VAL: read the V2.1 info block if applicable, check the
 *    checksum (a mismatch is only logged) and warn when the record count
 *    has reached 90% of the bad page threshold.
 *  - RAS_TABLE_HDR_BAD with a non-zero amdgpu_bad_page_threshold: same
 *    info/checksum steps, then either restore the header tag to
 *    RAS_TABLE_HDR_VAL (threshold now above the record count) or report
 *    that the threshold is exceeded.
 *  - anything else: reset the table.
 *
 * Return 0 on success or when the RAS EEPROM is not supported,
 * -ENOENT if the I2C adapter is not initialized, -EINVAL if no EEPROM
 * I2C address can be determined, -EIO on a short header read, or another
 * negative value propagated from the helpers called.
 */
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
			   bool *exceed_err_limit)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	unsigned char raw_hdr[RAS_TABLE_HEADER_SIZE] = { 0 };
	bool is_v2_1;
	int res;

	*exceed_err_limit = false;

	if (!__is_ras_eeprom_supported(adev))
		return 0;

	/* The I2C adapter has to be initialized before touching the EEPROM */
	if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
		return -ENOENT;

	if (!__get_eeprom_i2c_addr(adev, control))
		return -EINVAL;

	control->ras_header_offset = RAS_HDR_START;
	control->ras_info_offset = RAS_TABLE_V2_1_INFO_START;
	mutex_init(&control->ras_tbl_mutex);

	/* Fetch the raw table header from the EEPROM */
	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address + control->ras_header_offset,
				 raw_hdr, RAS_TABLE_HEADER_SIZE);
	if (res < RAS_TABLE_HEADER_SIZE) {
		DRM_ERROR("Failed to read EEPROM table header, res:%d", res);
		return res >= 0 ? -EIO : res;
	}

	__decode_table_header_from_buf(hdr, raw_hdr);
	is_v2_1 = hdr->version == RAS_TABLE_VER_V2_1;

	/* Table geometry depends on the table version */
	if (is_v2_1) {
		control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
		control->ras_record_offset = RAS_RECORD_START_V2_1;
		control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
	} else {
		control->ras_num_recs = RAS_NUM_RECS(hdr);
		control->ras_record_offset = RAS_RECORD_START;
		control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
	}
	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);

	/* Neither a valid table nor a recoverable "bad" one: start afresh */
	if (hdr->header != RAS_TABLE_HDR_VAL &&
	    (hdr->header != RAS_TABLE_HDR_BAD || amdgpu_bad_page_threshold == 0)) {
		DRM_INFO("Creating a new EEPROM table");

		res = amdgpu_ras_eeprom_reset_table(control);
		return res < 0 ? res : 0;
	}

	if (hdr->header == RAS_TABLE_HDR_VAL) {
		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
				 control->ras_num_recs);

		if (is_v2_1) {
			res = __read_table_ras_info(control);
			if (res)
				return res;
		}

		res = __verify_ras_table_checksum(control);
		if (res)
			DRM_ERROR("RAS table incorrect checksum or error:%d\n",
				  res);

		/* Warn if we are at 90% of the threshold or above */
		if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
			dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
				 control->ras_num_recs,
				 ras->bad_page_cnt_threshold);

		return res < 0 ? res : 0;
	}

	/* From here on: header tagged RAS_TABLE_HDR_BAD, threshold non-zero */
	if (is_v2_1) {
		res = __read_table_ras_info(control);
		if (res)
			return res;
	}

	res = __verify_ras_table_checksum(control);
	if (res)
		DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
			  res);

	if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
		/* The threshold is now above the stored record count, so
		 * ras->bad_page_cnt_threshold - control->ras_num_recs > 0
		 * and at least one more record can be saved before the
		 * page count threshold is reached. Mark the table valid
		 * again.
		 */
		dev_info(adev->dev,
			 "records:%d threshold:%d, resetting "
			 "RAS table header signature",
			 control->ras_num_recs,
			 ras->bad_page_cnt_threshold);
		res = amdgpu_ras_eeprom_correct_header_tag(control,
							   RAS_TABLE_HDR_VAL);
		return res < 0 ? res : 0;
	}

	dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
		control->ras_num_recs, ras->bad_page_cnt_threshold);

	if (amdgpu_bad_page_threshold == -1) {
		/* A threshold of -1 lets initialization continue regardless */
		dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
		res = 0;
	} else {
		*exceed_err_limit = true;
		dev_err(adev->dev,
			"RAS records:%d exceed threshold:%d, "
			"GPU will not be initialized. Replace this GPU or increase the threshold",
			control->ras_num_recs, ras->bad_page_cnt_threshold);
	}

	return res < 0 ? res : 0;
}
1421