本文共 5979 字,大约阅读时间需要 19 分钟。
SCSI有两种常见的故障类型。
一种是卡自身发生故障,发出错误中断,或者在中断中携带错误信息;另一种是卡没有响应、中断丢失,从而触发定时器超时错误。对于第一种故障模型:
硬件中断执行结束后会触发软中断,流程如下void blk_done_softirq(struct softirq_action *h){ struct list_head *cpu_list, local_list; local_irq_disable(); cpu_list = &__get_cpu_var(blk_cpu_done); list_replace_init(cpu_list, &local_list); local_irq_enable(); while (!list_empty(&local_list)) {//遍历链表,执行钩子函数 struct request *rq; rq = list_entry(local_list.next, struct request, csd.list); list_del_init(&rq->csd.list); rq->q->softirq_done_fn(rq); =>void scsi_softirq_done(struct request *rq) { /*解析底层控制器中断的处理结果,对于USB控制器,是由usb_stor_invoke_transport完成 *错误一般是重试,走NEEDS_RETRY分支,最多重试5次,超过5次走default分支 */ disposition = scsi_decide_disposition(cmd); switch (disposition) { case SUCCESS: scsi_finish_command(cmd); break; case NEEDS_RETRY: scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY); break; case ADD_TO_MLQUEUE: scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY); break; default: ret = !scsi_eh_scmd_add(cmd, 0); =>int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) { struct Scsi_Host *shost = scmd->device->host; unsigned long flags; int ret = 0; if (!shost->ehandler) return 0; spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RECOVERY)) if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY)) goto out_unlock; ret = 1; scmd->eh_eflags |= eh_flag; list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); shost->host_failed++; scsi_eh_wakeup(shost);//唤醒异常处理线程 void scsi_eh_wakeup(struct Scsi_Host *shost)//内容太多,详见下面 out_unlock: spin_unlock_irqrestore(shost->host_lock, flags); return ret; } if (ret) scsi_finish_command(cmd); } } } }
错误处理线程流程如下:
void scsi_eh_wakeup(struct Scsi_Host *shost){ if (shost->host_busy == shost->host_failed) wake_up_process(shost->ehandler); =>int scsi_error_handler(void *data) { struct Scsi_Host *shost = data; /* * We use TASK_INTERRUPTIBLE so that the thread is not * counted against the load average as a running process. * We never actually get interrupted because kthread_run * disables signal delivery for the created thread. */ set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) || shost->host_failed != shost->host_busy) { schedule(); set_current_state(TASK_INTERRUPTIBLE); continue; } __set_current_state(TASK_RUNNING); /* * We have a host that is failing for some reason. Figure out * what we need to do to get it up and online again (if we can). * If we fail, we end up taking the thing offline. */ if (shost->transportt->eh_strategy_handler)//如果有自定义的钩子函数则执行自定义钩子函数 shost->transportt->eh_strategy_handler(shost); else scsi_unjam_host(shost);//系统默认钩子函数 =>void scsi_unjam_host(struct Scsi_Host *shost) { unsigned long flags; LIST_HEAD(eh_work_q); LIST_HEAD(eh_done_q); spin_lock_irqsave(shost->host_lock, flags); list_splice_init(&shost->eh_cmd_q, &eh_work_q); spin_unlock_irqrestore(shost->host_lock, flags); SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q)); if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q)) if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q)) /*Scsiglue.c (drivers\usb\storage): .eh_abort_handler = command_abort,*/ =>int command_abort(struct scsi_cmnd *srb) { set_bit(US_FLIDX_TIMED_OUT, &us->dflags);//定时器唤醒 if (!test_bit(US_FLIDX_RESETTING, &us->dflags)) { set_bit(US_FLIDX_ABORTING, &us->dflags); usb_stor_stop_transport(us); /* Stop the current URB transfer */ =>void usb_stor_stop_transport(struct us_data *us) { if (test_and_clear_bit(US_FLIDX_URB_ACTIVE, &us->dflags)) { US_DEBUGP("-- cancelling URB\n"); usb_unlink_urb(us->current_urb); =>int usb_unlink_urb(struct urb *urb) { 
return usb_hcd_unlink_urb(urb, -ECONNRESET); =>int usb_hcd_unlink_urb (struct urb *urb, int status) { retval = unlink1(hcd, urb, status); =>int unlink1(struct usb_hcd *hcd, struct urb *urb, int status) { value = usb_rh_urb_dequeue(hcd, urb, status); =>int usb_rh_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { usb_hcd_giveback_urb(hcd, urb, status); =>void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status) { urb->status = status; urb->complete (urb); } } } } } } } } } scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q); =>void scsi_eh_ready_devs(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { if (!scsi_eh_stu(shost, work_q, done_q))//逐级从轻到重复位 if (!scsi_eh_bus_device_reset(shost, work_q, done_q)) if (!scsi_eh_target_reset(shost, work_q, done_q)) if (!scsi_eh_bus_reset(shost, work_q, done_q)) if (!scsi_eh_host_reset(work_q, done_q)) /*搞不定则将其踢出去*/ scsi_eh_offline_sdevs(work_q, done_q); } scsi_eh_flush_done_q(&eh_done_q); } /* * Note - if the above fails completely, the action is to take * individual devices offline and flush the queue of any * outstanding requests that may have been pending. When we * restart, we restart any I/O to any other devices on the bus * which are still online. */ scsi_restart_operations(shost); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); shost->ehandler = NULL; return 0; }}
转载地址:http://ztlji.baihongyu.com/