diff mbox

2.6.31-rc5 regression: hd don't show up

Message ID 4A96460F.3020600@kernel.org
State Not Applicable
Delegated to: David Miller
Headers show

Commit Message

Tejun Heo Aug. 27, 2009, 8:38 a.m. UTC
Tim Blechmann wrote:
>>>>> running 2.6.31-rc5 (7cb7beb31aa3d941833b6a6e553687422c31e4b6 to be
>>>>> exact), sometimes some hard disks don't show up.
>>>>>
>>>>> after booting, my root hd (sda) is mounted to /, while two other hds
>>>>> (sdb/sdc) are mounted as a user. sda is always present, but the other
>>>>> two sometimes don't show up (i.e. they are not listed in /dev/disk/, nor
>>>>> do they have a /dev/sdX link). with 2.6.29 and 2.6.30, all three disks
>>>>> are reported correctly.
>>>> Can you please attach boot logs of a successful and a failed boot?
>>> i have two files attached:
>>> - dmesg_good - all hds are available
>>> - dmesg_bad - one hd is missing
>> Can you please apply the attached patch and post the bad boot log?
> 
> attached you find boot logs for both a good and a bad boot

Sorry about the long delay.  I somehow marked the message read without
actually reading it.

I suspected the problem was with getting the wrong classification code
or phantom device detection kicking in spuriously.  Looks like the
problem happens way before that.  Can you please apply the attached
patch and report the result?

Thanks.

Comments

Tim Blechmann Aug. 27, 2009, 10:21 a.m. UTC | #1
On 08/27/2009 10:38 AM, Tejun Heo wrote:
> Tim Blechmann wrote:
>>>>>> running 2.6.31-rc5 (7cb7beb31aa3d941833b6a6e553687422c31e4b6 to be
>>>>>> exact), sometimes some hard disks don't show up.
>>>>>>
>>>>>> after booting, my root hd (sda) is mounted to /, while two other hds
>>>>>> (sdb/sdc) are mounted as a user. sda is always present, but the other
>>>>>> two sometimes don't show up (i.e. they are not listed in /dev/disk/, nor
>>>>>> do they have a /dev/sdX link). with 2.6.29 and 2.6.30, all three disks
>>>>>> are reported correctly.
>>>>> Can you please attach boot logs of a successful and a failed boot?
>>>> i have two files attached:
>>>> - dmesg_good - all hds are available
>>>> - dmesg_bad - one hd is missing
>>> Can you please apply the attached patch and post the bad boot log?
>>
>> attached you find boot logs for both a good and a bad boot
> 
> Sorry about the long delay.  I somehow marked the message read without
> actually reading it.
> 
> I suspected the problem was with getting the wrong classification code
> or phantom device detection kicking in spuriously.  Looks like the
> problem happens way before that.  Can you please apply the attached
> patch and report the result?

i applied your patch on top of the current linus/master branch and
currently (after rebooting 5 or 6 times) i cannot reproduce the problem
any more ...
however, there is a warning stack trace in the boot log from libata code
(bootlog attached)

tim
diff mbox

Patch

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 072ba5e..df099b7 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3770,6 +3770,7 @@  int sata_link_resume(struct ata_link *link, const unsigned long *params,
 
 	scontrol = (scontrol & 0x0f0) | 0x300;
 
+	ata_link_printk(link, KERN_INFO, "XXX bringing up link\n");
 	if ((rc = sata_scr_write(link, SCR_CONTROL, scontrol)))
 		return rc;
 
@@ -3778,7 +3779,9 @@  int sata_link_resume(struct ata_link *link, const unsigned long *params,
 	 */
 	msleep(200);
 
-	if ((rc = sata_link_debounce(link, params, deadline)))
+	rc = sata_link_debounce(link, params, deadline);
+	ata_link_printk(link, KERN_INFO, "XXX debounced rc=%d\n", rc);
+	if (rc)
 		return rc;
 
 	/* clear SError, some PHYs require this even for SRST to work */
@@ -3904,8 +3907,13 @@  int sata_link_hardreset(struct ata_link *link, const unsigned long *timing,
 	if (rc)
 		goto out;
 	/* if link is offline nothing more to do */
-	if (ata_phys_link_offline(link))
+	if (ata_phys_link_offline(link)) {
+		ata_link_printk(link, KERN_INFO,
+				"XXX phys link offline, l=%d p=%d\n",
+				ata_link_offline(link),
+				ata_phys_link_offline(link));
 		goto out;
+	}
 
 	/* Link is online.  From this point, -ENODEV too is an error. */
 	if (online)
@@ -6060,7 +6068,7 @@  static void async_port_probe(void *data, async_cookie_t cookie)
 
 		ehi->probe_mask |= ATA_ALL_DEVICES;
 		ehi->action |= ATA_EH_RESET | ATA_EH_LPM;
-		ehi->flags |= ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET;
+		ehi->flags |= ATA_EHI_NO_AUTOPSY/* | ATA_EHI_QUIET*/;
 
 		ap->pflags &= ~ATA_PFLAG_INITIALIZING;
 		ap->pflags |= ATA_PFLAG_LOADING;
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index bbbb1fa..c718d12 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -1998,6 +1998,9 @@  unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
 	if (r_err)
 		*r_err = err;
 
+	ata_dev_printk(dev, KERN_INFO, "XXX CLASSIFY TF %02x/%02x:%02x:%02x:%02x\n",
+		       tf.command, tf.feature, tf.lbal, tf.lbam, tf.lbah);
+
 	/* see if device passed diags: continue and warn later */
 	if (err == 0)
 		/* diagnostic fail : do nothing _YET_ */
@@ -2006,11 +2009,14 @@  unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
 		/* do nothing */ ;
 	else if ((dev->devno == 0) && (err == 0x81))
 		/* do nothing */ ;
-	else
+	else {
+		ata_dev_printk(dev, KERN_INFO, "XXX diag nodev\n");
 		return ATA_DEV_NONE;
+	}
 
 	/* determine if device is ATA or ATAPI */
 	class = ata_dev_classify(&tf);
+	ata_dev_printk(dev, KERN_INFO, "XXX ata_dev_classify=%d\n", class);
 
 	if (class == ATA_DEV_UNKNOWN) {
 		/* If the device failed diagnostic, it's likely to
@@ -2019,13 +2025,18 @@  unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
 		 * device signature is invalid with diagnostic
 		 * failure.
 		 */
-		if (present && (dev->horkage & ATA_HORKAGE_DIAGNOSTIC))
+		if (present && (dev->horkage & ATA_HORKAGE_DIAGNOSTIC)) {
+			ata_dev_printk(dev, KERN_INFO, "XXX UNK && present -> ATA\n");
 			class = ATA_DEV_ATA;
-		else
+		} else {
 			class = ATA_DEV_NONE;
+			ata_dev_printk(dev, KERN_INFO, "XXX UNK && !present -> NONE\n");
+		}
 	} else if ((class == ATA_DEV_ATA) &&
-		   (ap->ops->sff_check_status(ap) == 0))
+		   (ap->ops->sff_check_status(ap) == 0)) {
 		class = ATA_DEV_NONE;
+		ata_dev_printk(dev, KERN_INFO, "XXX stat==0 -> NONE\n");
+	}
 
 	return class;
 }