Message ID | 4F9ABCEC.9090807@redhat.com (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote: > The fix for it were in another patch[1], as calling them as "rank" is > needed also at the sysfs API. No, this doesn't fix it either: [ 10.486440] EDAC MC: DCT0 chip selects: [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088 [ 10.486455] EDAC MC: DCT1 chip selects: [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486467] EDAC amd64: using x8 syndromes. [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding [ 10.486488] EDAC amd64: MCT channel count: 2 [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. Now your change is showing 16 ranks. Still b0rked.
Em 28-04-2012 06:05, Borislav Petkov escreveu: > On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote: >> The fix for it were in another patch[1], as calling them as "rank" is >> needed also at the sysfs API. > > No, this doesn't fix it either: > > [ 10.486440] EDAC MC: DCT0 chip selects: > [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB > [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB > [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB > [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB > [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088 > [ 10.486455] EDAC MC: DCT1 chip selects: > [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB > [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB > [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB > [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB > [ 10.486467] EDAC amd64: using x8 syndromes. > [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 > [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes > [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled > [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b > [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no > [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding > [ 10.486488] EDAC amd64: MCT channel count: 2 > [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) > [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 > [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 > [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 > [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 > [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 > [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 > [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 > [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 > [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 > [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 > [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 > [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 > [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 > [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 > [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 > [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 > > DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. > > Now your change is showing 16 ranks. Still b0rked. > No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc doesn't know how many ranks are filled, as the driver logic first calls it to allocate for the max amount of ranks, and then fills the rank with their info (or let them untouched with 0 pages, if they're empty). Regards, Mauro
On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote: > > [ 10.486440] EDAC MC: DCT0 chip selects: > > [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB > > [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB > > [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB > > [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB > > [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088 > > [ 10.486455] EDAC MC: DCT1 chip selects: > > [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB > > [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB > > [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB > > [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB > > [ 10.486467] EDAC amd64: using x8 syndromes. > > [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 > > [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes > > [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled > > [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b > > [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no > > [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding > > [ 10.486488] EDAC amd64: MCT channel count: 2 > > [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) > > [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 > > [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 > > [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 > > [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 > > [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 > > [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 > > [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 > > [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 > > [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 > > [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 > > [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 > > [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 > > [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 > > [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 > > [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 > > [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 > > > > DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. > > > > Now your change is showing 16 ranks. Still b0rked. > > > No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. > > As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc > doesn't know how many ranks are filled, as the driver logic first calls it to > allocate for the max amount of ranks, and then fills the rank with their info > (or let them untouched with 0 pages, if they're empty). Basically you're saying you're generating dimm_info structs for all _possible_ dimms and the loop where this debug message comes from goes and marrily initializes them all although some of them are empty: + for (i = 0; i < tot_dimms; i++) { + chan = &csi[row].channels[chn]; + dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers, + pos[0], pos[1], pos[2]); + dimm->mci = mci; + + debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__, + i, (dimm - mci->dimms), + pos[0], pos[1], pos[2], row, chn); + + /* Copy DIMM location */ + for (j = 0; j < n_layers; j++) + dimm->location[j] = pos[j]; ... definitely superfluous. Oh well, looking at edac_mc_alloc, it used to allocate structs for all csrows on the controller even though some of them were empty... Ok, then please remove this debug call because it is misleading. Having [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) is enough. You probably want to say how many channels/csrows there are, though: [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2 channels) or something similar. Simply dump tot_dimms, tot_channels and tot_csrows and that's it.
Em 30-04-2012 05:15, Borislav Petkov escreveu: > On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote: >>> [ 10.486440] EDAC MC: DCT0 chip selects: >>> [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB >>> [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB >>> [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB >>> [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB >>> [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088 >>> [ 10.486455] EDAC MC: DCT1 chip selects: >>> [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB >>> [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB >>> [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB >>> [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB >>> [ 10.486467] EDAC amd64: using x8 syndromes. >>> [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 >>> [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes >>> [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled >>> [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b >>> [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no >>> [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding >>> [ 10.486488] EDAC amd64: MCT channel count: 2 >>> [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) >>> [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 >>> [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 >>> [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 >>> [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 >>> [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 >>> [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 >>> [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 >>> [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 >>> [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 >>> [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 >>> [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 >>> [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 >>> [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 >>> [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 >>> [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 >>> [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 >>> >>> DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. >>> >>> Now your change is showing 16 ranks. Still b0rked. >>> >> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. >> >> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc >> doesn't know how many ranks are filled, as the driver logic first calls it to >> allocate for the max amount of ranks, and then fills the rank with their info >> (or let them untouched with 0 pages, if they're empty). > > Basically you're saying you're generating dimm_info structs for all > _possible_ dimms and the loop where this debug message comes from goes > and marrily initializes them all although some of them are empty: > > + for (i = 0; i < tot_dimms; i++) { > + chan = &csi[row].channels[chn]; > + dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers, > + pos[0], pos[1], pos[2]); > + dimm->mci = mci; > + > + debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__, > + i, (dimm - mci->dimms), > + pos[0], pos[1], pos[2], row, chn); > + > + /* Copy DIMM location */ > + for (j = 0; j < n_layers; j++) > + dimm->location[j] = pos[j]; > ... > > definitely superfluous. > > Oh well, looking at edac_mc_alloc, it used to allocate structs for all > csrows on the controller even though some of them were empty... > > Ok, then please remove this debug call because it is misleading. Having > > [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) > > is enough. > > You probably want to say how many channels/csrows there are, though: > > [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2 channels) > > or something similar. Simply dump tot_dimms, tot_channels and tot_csrows > and that's it. > It seems you have a very short memory. We had a similar discussion about that a while ago: https://lkml.org/lkml/2012/3/8/440 See my comments at: https://lkml.org/lkml/2012/3/9/101 https://lkml.org/lkml/2012/3/9/267 As it was explained there, those debug messages provide a map between the legacy per-csrow data, used by the old API and the dimm_info representation. For a per-csrow memory controller, the map is trivial, as the memory location will match the csrow/channel information, but for modern memory controllers, the map info is not trivial and it helps to check what it is expected to be found when retrieving information via the legacy EDAC API. For example, this is the mapping used by the second memory controller of the SB machine I'm using on my tests: [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2) ... [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels) [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2 [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3 [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0 [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1 [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2 [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3 [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0 [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1 [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2 [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3 With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy EDAC API call. Regards, Mauro
On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: > It seems you have a very short memory. Oh, puh-lease, let's don't start with the insults now. You're not a saint yourself. And maybe the fact that I'm having hard time grasping your code is maybe because it is a load of crap and you seem to generate a lot of senseless drivel when explaining what it does. And don't get me started on the patch bombs. So, let's stay constructive here before I, as the last and only one person reviewing this stinking pile stops messing with it (I got other stuff to do, you know) and NACK it completely. > For example, this is the mapping used by the second memory controller of the SB machine > I'm using on my tests: > > [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2) > ... > [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels) > [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms > [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 > [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 > [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2 > [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3 > [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0 > [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1 > [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2 > [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3 > [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0 > [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1 > [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2 > [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3 > > With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is > called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy > EDAC API call. Are all those DIMM slots above populated? What happens if they're not, are you issuing the same dimm0-dimm11 lines for slots which aren't even populated? I have a much better idea: Generally, this debug info should come from the specific driver that allocates the dimm descriptors, not from the EDAC core. This way, you know in the driver which slots are populated and those which are not should be omitted. This way it says "initializing 12 dimms" and the user thinks there are 12 DIMMs on his system where this might not be true.
Em 30-04-2012 05:15, Borislav Petkov escreveu: > On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote: >>> [ 10.486440] EDAC MC: DCT0 chip selects: >>> [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB >>> [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB >>> [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB >>> [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB >>> [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088 >>> [ 10.486455] EDAC MC: DCT1 chip selects: >>> [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB >>> [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB >>> [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB >>> [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB >>> [ 10.486467] EDAC amd64: using x8 syndromes. >>> [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 >>> [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes >>> [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled >>> [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b >>> [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no >>> [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding >>> [ 10.486488] EDAC amd64: MCT channel count: 2 >>> [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) >>> [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 >>> [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 >>> [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 >>> [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 >>> [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 >>> [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 >>> [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 >>> [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 >>> [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 >>> [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 >>> [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 >>> [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 >>> [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 >>> [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 >>> [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 >>> [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 >>> >>> DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. >>> >>> Now your change is showing 16 ranks. Still b0rked. >>> >> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. >> >> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc >> doesn't know how many ranks are filled, as the driver logic first calls it to >> allocate for the max amount of ranks, and then fills the rank with their info >> (or let them untouched with 0 pages, if they're empty). > > Basically you're saying you're generating dimm_info structs for all > _possible_ dimms and the loop where this debug message comes from goes > and marrily initializes them all although some of them are empty: > > + for (i = 0; i < tot_dimms; i++) { > + chan = &csi[row].channels[chn]; > + dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers, > + pos[0], pos[1], pos[2]); > + dimm->mci = mci; > + > + debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__, > + i, (dimm - mci->dimms), > + pos[0], pos[1], pos[2], row, chn); > + > + /* Copy DIMM location */ > + for (j = 0; j < n_layers; j++) > + dimm->location[j] = pos[j]; > ... > > definitely superfluous. This is the way the EDAC core works: everything is allocated, on one shot, when this function is called, and, on most drivers, before the code that probes how many DIMMS/ranks got initialized. That happens because the edac_mc_alloc() arguments provide the total amount of ranks/dimms, but doesn't say anything about what is used there. Changing from this model to another model that would dynamically initialize the per-dimm/rank data is possible, but that would require another set of patches that will touch on all drivers, and to convert the edac_mc_alloc function into 3 or 4 function calls, with the corresponding changes on all drivers. Also, the changes at the drivers won't likely be trivial. The patches that convert kobj into "struct device" does part of the job, as, after it, each dimm/csrow will be allocated by a separate kmalloc. After having this series fully applied, it would be possible to work on such solution. I'll eventually do that, as this would simplify the code at i7core_edac and sb_edac. Regards, Mauro
Em 30-04-2012 08:11, Borislav Petkov escreveu: > On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: >> For example, this is the mapping used by the second memory controller of the SB machine >> I'm using on my tests: >> >> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2) >> ... >> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels) >> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms >> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 >> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 >> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2 >> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3 >> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0 >> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1 >> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2 >> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3 >> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0 >> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1 >> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2 >> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3 >> >> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is >> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy >> EDAC API call. > > Are all those DIMM slots above populated? What happens if they're not, > are you issuing the same dimm0-dimm11 lines for slots which aren't even > populated? > > I have a much better idea: Generally, this debug info should come from > the specific driver that allocates the dimm descriptors, not from the > EDAC core. This way, you know in the driver which slots are populated > and those which are not should be omitted. The drivers don't allocate the dimm descriptors. They're allocated by the core. > This way it says "initializing 12 dimms" and the user thinks there are > 12 DIMMs on his system where this might not be true. I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything new. With regards do the other messages, if the debug messages are not clear, then let's fix them, instead of removing. What if we print, instead, on a message like: "row 1, chan 1 will represent dimm5 (1:2:0) if not empty" Regards, Mauro
On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote: > Em 30-04-2012 08:11, Borislav Petkov escreveu: > > On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: > > >> For example, this is the mapping used by the second memory controller of the SB machine > >> I'm using on my tests: > >> > >> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2) > >> ... > >> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels) > >> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms > >> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 > >> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 > >> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2 > >> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3 > >> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0 > >> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1 > >> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2 > >> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3 > >> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0 > >> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1 > >> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2 > >> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3 > >> > >> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is > >> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy > >> EDAC API call. > > > > Are all those DIMM slots above populated? What happens if they're not, > > are you issuing the same dimm0-dimm11 lines for slots which aren't even > > populated? > > > > I have a much better idea: Generally, this debug info should come from > > the specific driver that allocates the dimm descriptors, not from the > > EDAC core. This way, you know in the driver which slots are populated > > and those which are not should be omitted. > > The drivers don't allocate the dimm descriptors. They're allocated by the > core. I know that. The drivers call into EDAC core using edac_mc_alloc, this is what I meant above. > > This way it says "initializing 12 dimms" and the user thinks there are > > 12 DIMMs on his system where this might not be true. > > > I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything > new. > > With regards do the other messages, if the debug messages are not clear, > then let's fix them, instead of removing. What if we print, instead, > on a message like: > > "row 1, chan 1 will represent dimm5 (1:2:0) if not empty" How about the following instead: the specific driver calls edac_mc_alloc(), it gets the allocated dimm array in mci->dimms _without_ dumping each dimm%d line. Then, each driver figures out which subset of that dimms array actually has populated slots and prints only the populated rank/slot/... This information is much more valuable than saying how many _possible_ slots the edac core has allocated. Then, each driver can decide whether it makes sense to dump that info or not.
Em 30-04-2012 09:38, Borislav Petkov escreveu: > On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote: >> Em 30-04-2012 08:11, Borislav Petkov escreveu: >>> On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: >>> This way it says "initializing 12 dimms" and the user thinks there are >>> 12 DIMMs on his system where this might not be true. >> >> >> I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything >> new. >> >> With regards do the other messages, if the debug messages are not clear, >> then let's fix them, instead of removing. What if we print, instead, >> on a message like: >> >> "row 1, chan 1 will represent dimm5 (1:2:0) if not empty" > > How about the following instead: the specific driver calls > edac_mc_alloc(), it gets the allocated dimm array in mci->dimms > _without_ dumping each dimm%d line. Then, each driver figures out which > subset of that dimms array actually has populated slots and prints only > the populated rank/slot/... > > This information is much more valuable than saying how many _possible_ > slots the edac core has allocated. > > Then, each driver can decide whether it makes sense to dump that info or > not. No, that would add extra complexity at the drivers level just due to debug messages. I think that the better is to move this printk to the debug-specific routine that is called only when the dimm is filled (edac_mc_dump_dimm). With this cange, the message will be printed only for the filled dimms. This is a cleanup patch, so I'll write it, together with the change that will get rid of the loop that uses KERN_CONT. It will use a function added by a latter patch at edac_mc_sysfs so it can't be merged on this patch anyway. Regards, Mauro
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 4d4d8b7..e0d9481 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -86,7 +86,7 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci) debugf4("\tmci->edac_check = %p\n", mci->edac_check); debugf3("\tmci->nr_csrows = %d, csrows = %p\n", mci->nr_csrows, mci->csrows); - debugf3("\tmci->nr_dimms = %d, dimns = %p\n", + debugf3("\tmci->nr_dimms = %d, dimms = %p\n", mci->tot_dimms, mci->dimms); debugf3("\tdev = %p\n", mci->dev); debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name); @@ -183,10 +183,6 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems) * @size_pvt: size of private storage needed * * - * FIXME: drivers handle multi-rank memories on different ways: on some - * drivers, one multi-rank memory is mapped as one DIMM, while, on others, - * a single multi-rank DIMM would be mapped into several "dimms". - * * Non-csrow based drivers (like FB-DIMM and RAMBUS ones) will likely report * such DIMMS properly, but the CSROWS-based ones will likely do the wrong * thing, as two chip select values are used for dual-rank memories (and 4, for @@ -201,6 +197,12 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems) * * Use edac_mc_free() to free mc structures allocated by this function. * + * NOTE: drivers handle multi-rank memories on different ways: on some + * drivers, one multi-rank memory is mapped as one entry, while, on others, + * a single multi-rank DIMM would be mapped into several entries. Currently, + * this function will allocate multiple struct dimm_info on such scenarios, + * as grouping the multiple ranks require drivers change. + * * Returns: * NULL allocation failed * struct mem_ctl_info pointer @@ -220,10 +222,11 @@ struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; void *pvt; unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; - unsigned tot_csrows, tot_cschannels; + unsigned tot_csrows, tot_cschannels, tot_errcount = 0; int i, j; int err; int row, chn; + bool per_rank = false; BUG_ON(n_layers > EDAC_MAX_LAYERS); /* @@ -239,6 +242,9 @@ struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, tot_csrows *= layers[i].size; else tot_cschannels *= layers[i].size; + + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) + per_rank = true; } /* Figure out the offsets of the various items from the start of an mc @@ -254,14 +260,21 @@ struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, count = 1; for (i = 0; i < n_layers; i++) { count *= layers[i].size; + debugf4("%s: errcount layer %d size %d\n", __func__, i, count); ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); + tot_errcount += 2 * count; } + + debugf4("%s: allocating %d error counters\n", __func__, tot_errcount); pvt = edac_align_ptr(&ptr, sz_pvt, 1); size = ((unsigned long)pvt) + sz_pvt; - debugf1("%s(): allocating %u bytes for mci data (%d dimms, %d csrows/channels)\n", - __func__, size, tot_dimms, tot_csrows * tot_cschannels); + debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n", + __func__, size, + tot_dimms, + per_rank ? "ranks" : "dimms", + tot_csrows * tot_cschannels); mci = kzalloc(size, GFP_KERNEL); if (mci == NULL) return NULL; @@ -290,6 +303,7 @@ struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, memcpy(mci->layers, layers, sizeof(*lay) * n_layers); mci->nr_csrows = tot_csrows; mci->num_cschannel = tot_cschannels; + mci->mem_is_per_rank = per_rank; /* * Fills the csrow struct @@ -315,15 +329,16 @@ struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, memset(&pos, 0, sizeof(pos)); row = 0; chn = 0; - debugf4("%s: initializing %d dimms\n", __func__, tot_dimms); + debugf4("%s: initializing %d %s\n", __func__, tot_dimms, + per_rank ? "ranks" : "dimms"); for (i = 0; i < tot_dimms; i++) { chan = &csi[row].channels[chn]; dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers, pos[0], pos[1], pos[2]); dimm->mci = mci; - debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__, - i, (dimm - mci->dimms), + debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__, + i, per_rank ? "rank" : "dimm", (dimm - mci->dimms), pos[0], pos[1], pos[2], row, chn); /* Copy DIMM location */ @@ -1040,8 +1055,10 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, * get csrow/channel of the dimm, in order to allow * incrementing the compat API counters */ - debugf4("%s: dimm csrows (%d,%d)\n", - __func__, dimm->csrow, dimm->cschannel); + debugf4("%s: %s csrows map: (%d,%d)\n", + __func__, + mci->mem_is_per_rank ? "rank" : "dimm", + dimm->csrow, dimm->cschannel); if (row == -1) row = dimm->csrow; else if (row >= 0 && row != dimm->csrow) diff --git a/include/linux/edac.h b/include/linux/edac.h index 412d5cd..2b66109 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -555,6 +555,8 @@ struct mem_ctl_info { /* Memory Controller hierarchy */ unsigned n_layers; struct edac_mc_layer *layers; + bool mem_is_per_rank; + /* * DIMM info. Will eventually remove the entire csrows_info some day */