diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index ad40e848d564..9977abff92fc 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -76,11 +76,6 @@ static enum pci_protocol_version_t pci_protocol_versions[] = {
 	PCI_PROTOCOL_VERSION_1_1,
 };
 
-/*
- * Protocol version negotiated by hv_pci_protocol_negotiation().
- */
-static enum pci_protocol_version_t pci_protocol_version;
-
 #define PCI_CONFIG_MMIO_LENGTH	0x2000
 #define CFG_PAGE_OFFSET 0x1000
 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
@@ -455,12 +450,15 @@ enum hv_pcibus_state {
 	hv_pcibus_init = 0,
 	hv_pcibus_probed,
 	hv_pcibus_installed,
+	hv_pcibus_removing,
 	hv_pcibus_removed,
 	hv_pcibus_maximum
 };
 
 struct hv_pcibus_device {
 	struct pci_sysdata sysdata;
+	/* Protocol version negotiated with the host */
+	enum pci_protocol_version_t protocol_version;
 	enum hv_pcibus_state state;
 	refcount_t remove_lock;
 	struct hv_device *hdev;
@@ -1224,7 +1222,7 @@ static void hv_irq_unmask(struct irq_data *data)
 	 * negative effect (yet?).
 	 */
 
-	if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
+	if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
 		/*
 		 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
 		 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
@@ -1394,7 +1392,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 	ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
 	ctxt.pci_pkt.compl_ctxt = &comp;
 
-	switch (pci_protocol_version) {
+	switch (hbus->protocol_version) {
 	case PCI_PROTOCOL_VERSION_1_1:
 		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
 					dest,
@@ -1681,6 +1679,23 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus)
 
 	spin_lock_irqsave(&hbus->device_list_lock, flags);
 
+	/*
+	 * Clear the memory enable bit, in case it's already set. This occurs
+	 * in the suspend path of hibernation, where the device is suspended,
+	 * resumed and suspended again: see hibernation_snapshot() and
+	 * hibernation_platform_enter().
+	 *
+	 * If the memory enable bit is already set, Hyper-V silently ignores
+	 * the below BAR updates, and the related PCI device driver cannot
+	 * work, because reading from the device register(s) always returns
+	 * 0xFFFFFFFF.
+	 */
+	list_for_each_entry(hpdev, &hbus->children, list_entry) {
+		_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
+		command &= ~PCI_COMMAND_MEMORY;
+		_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
+	}
+
 	/* Pick addresses for the BARs. */
 	do {
 		list_for_each_entry(hpdev, &hbus->children, list_entry) {
@@ -2107,6 +2122,12 @@ static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
 	unsigned long flags;
 	bool pending_dr;
 
+	if (hbus->state == hv_pcibus_removing) {
+		dev_info(&hbus->hdev->device,
+			 "PCI VMBus BUS_RELATIONS: ignored\n");
+		return;
+	}
+
 	dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
 	if (!dr_wrk)
 		return;
@@ -2223,11 +2244,19 @@ static void hv_eject_device_work(struct work_struct *work)
  */
 static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	struct hv_device *hdev = hbus->hdev;
+
+	if (hbus->state == hv_pcibus_removing) {
+		dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
+		return;
+	}
+
 	hpdev->state = hv_pcichild_ejecting;
 	get_pcichild(hpdev);
 	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
-	get_hvpcibus(hpdev->hbus);
-	queue_work(hpdev->hbus->wq, &hpdev->wrk);
+	get_hvpcibus(hbus);
+	queue_work(hbus->wq, &hpdev->wrk);
 }
 
 /**
@@ -2379,8 +2408,11 @@ static void hv_pci_onchannelcallback(void *context)
 * failing if the host doesn't support the necessary protocol
 * level.
 */
-static int hv_pci_protocol_negotiation(struct hv_device *hdev)
+static int hv_pci_protocol_negotiation(struct hv_device *hdev,
+				       enum pci_protocol_version_t version[],
+				       int num_version)
 {
+	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
 	struct pci_version_request *version_req;
 	struct hv_pci_compl comp_pkt;
 	struct pci_packet *pkt;
@@ -2403,8 +2435,8 @@ static int hv_pci_protocol_negotiation(struct hv_device *hdev)
 	version_req = (struct pci_version_request *)&pkt->message;
 	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
 
-	for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
-		version_req->protocol_version = pci_protocol_versions[i];
+	for (i = 0; i < num_version; i++) {
+		version_req->protocol_version = version[i];
 		ret = vmbus_sendpacket(hdev->channel, version_req,
 				sizeof(struct pci_version_request),
 				(unsigned long)pkt, VM_PKT_DATA_INBAND,
@@ -2420,10 +2452,10 @@ static int hv_pci_protocol_negotiation(struct hv_device *hdev)
 		}
 
 		if (comp_pkt.completion_status >= 0) {
-			pci_protocol_version = pci_protocol_versions[i];
+			hbus->protocol_version = version[i];
 			dev_info(&hdev->device,
 				"PCI VMBus probing: Using version %#x\n",
-				pci_protocol_version);
+				hbus->protocol_version);
 			goto exit;
 		}
 
@@ -2707,7 +2739,7 @@ static int hv_send_resources_allocated(struct hv_device *hdev)
 	u32 wslot;
 	int ret;
 
-	size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
+	size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
 			? sizeof(*res_assigned) : sizeof(*res_assigned2);
 
 	pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
@@ -2726,7 +2758,7 @@ static int hv_send_resources_allocated(struct hv_device *hdev)
 		pkt->completion_func = hv_pci_generic_compl;
 		pkt->compl_ctxt = &comp_pkt;
 
-		if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) {
+		if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
 			res_assigned =
 				(struct pci_resources_assigned *)&pkt->message;
 			res_assigned->message_type.type =
@@ -2870,9 +2902,27 @@ static int hv_pci_probe(struct hv_device *hdev,
 	 * hv_pcibus_device contains the hypercall arguments for retargeting in
 	 * hv_irq_unmask(). Those must not cross a page boundary.
 	 */
-	BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE);
+	BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
 
-	hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL);
+	/*
+	 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
+	 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
+	 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
+	 * alignment of hbus is important because hbus's field
+	 * retarget_msi_interrupt_params must not cross a 4KB page boundary.
+	 *
+	 * Here we prefer kzalloc to get_zeroed_page(), because a buffer
+	 * allocated by the latter is not tracked and scanned by kmemleak, and
+	 * hence kmemleak reports the pointer contained in the hbus buffer
+	 * (i.e. the hpdev struct, which is created in new_pcichild_device() and
+	 * is tracked by hbus->children) as a memory leak (false positive).
+	 *
+	 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
+	 * used to allocate the hbus buffer and we can avoid the kmemleak false
+	 * positive by using kmemleak_alloc() and kmemleak_free() to ask
+	 * kmemleak to track and scan the hbus buffer.
+	 */
+	hbus = (struct hv_pcibus_device *)kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
 	if (!hbus)
 		return -ENOMEM;
 	hbus->state = hv_pcibus_init;
@@ -2930,7 +2980,8 @@ static int hv_pci_probe(struct hv_device *hdev,
 
 	hv_set_drvdata(hdev, hbus);
 
-	ret = hv_pci_protocol_negotiation(hdev);
+	ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
+					  ARRAY_SIZE(pci_protocol_versions));
 	if (ret)
 		goto close;
 
@@ -3011,7 +3062,7 @@ free_bus:
 	return ret;
 }
 
-static void hv_pci_bus_exit(struct hv_device *hdev)
+static int hv_pci_bus_exit(struct hv_device *hdev, bool hibernating)
 {
 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
 	struct {
@@ -3027,16 +3078,20 @@ static void hv_pci_bus_exit(struct hv_device *hdev)
 	 * access the per-channel ringbuffer any longer.
 	 */
 	if (hdev->channel->rescind)
-		return;
+		return 0;
 
-	/* Delete any children which might still exist. */
-	memset(&relations, 0, sizeof(relations));
-	hv_pci_devices_present(hbus, &relations);
+	if (!hibernating) {
+		/* Delete any children which might still exist. */
+		memset(&relations, 0, sizeof(relations));
+		hv_pci_devices_present(hbus, &relations);
+	}
 
 	ret = hv_send_resources_released(hdev);
-	if (ret)
+	if (ret) {
 		dev_err(&hdev->device,
 			"Couldn't send resources released packet(s)\n");
+		return ret;
+	}
 
 	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
 	init_completion(&comp_pkt.host_event);
@@ -3049,8 +3104,13 @@ static void hv_pci_bus_exit(struct hv_device *hdev)
 			       (unsigned long)&pkt.teardown_packet,
 			       VM_PKT_DATA_INBAND,
 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
-	if (!ret)
-		wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
+	if (ret)
+		return ret;
+
+	if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0)
+		return -ETIMEDOUT;
+
+	return 0;
 }
 
 /**
@@ -3062,6 +3122,7 @@ static void hv_pci_bus_exit(struct hv_device *hdev)
 static int hv_pci_remove(struct hv_device *hdev)
 {
 	struct hv_pcibus_device *hbus;
+	int ret;
 
 	hbus = hv_get_drvdata(hdev);
 	if (hbus->state == hv_pcibus_installed) {
@@ -3074,7 +3135,7 @@ static int hv_pci_remove(struct hv_device *hdev)
 		hbus->state = hv_pcibus_removed;
 	}
 
-	hv_pci_bus_exit(hdev);
+	ret = hv_pci_bus_exit(hdev, false);
 
 	vmbus_close(hdev->channel);
 
@@ -3090,10 +3151,97 @@ static int hv_pci_remove(struct hv_device *hdev)
 
 	hv_put_dom_num(hbus->sysdata.domain);
 
-	free_page((unsigned long)hbus);
+	kfree(hbus);
+	return ret;
+}
+
+static int hv_pci_suspend(struct hv_device *hdev)
+{
+	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
+	enum hv_pcibus_state old_state;
+	int ret;
+
+	/*
+	 * hv_pci_suspend() must make sure there are no pending work items
+	 * before calling vmbus_close(), since it runs in a process context
+	 * as a callback in dpm_suspend(). When it starts to run, the channel
+	 * callback hv_pci_onchannelcallback(), which runs in a tasklet
+	 * context, can still be running concurrently and scheduling new work
+	 * items onto hbus->wq in hv_pci_devices_present() and
+	 * hv_pci_eject_device(), and the work item handlers can access the
+	 * vmbus channel, which can be being closed by hv_pci_suspend(), e.g.
+	 * the work item handler pci_devices_present_work() ->
+	 * new_pcichild_device() writes to the vmbus channel.
+	 *
+	 * To eliminate the race, hv_pci_suspend() disables the channel
+	 * callback tasklet, sets hbus->state to hv_pcibus_removing, and
+	 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
+	 * it knows that no new work item can be scheduled, and then it flushes
+	 * hbus->wq and safely closes the vmbus channel.
+	 */
+	tasklet_disable(&hdev->channel->callback_event);
+
+	/* Change the hbus state to prevent new work items. */
+	old_state = hbus->state;
+	if (hbus->state == hv_pcibus_installed)
+		hbus->state = hv_pcibus_removing;
+
+	tasklet_enable(&hdev->channel->callback_event);
+
+	if (old_state != hv_pcibus_installed)
+		return -EINVAL;
+
+	flush_workqueue(hbus->wq);
+
+	ret = hv_pci_bus_exit(hdev, true);
+	if (ret)
+		return ret;
+
+	vmbus_close(hdev->channel);
+
 	return 0;
 }
 
+static int hv_pci_resume(struct hv_device *hdev)
+{
+	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
+	enum pci_protocol_version_t version[1];
+	int ret;
+
+	hbus->state = hv_pcibus_init;
+
+	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
+			 hv_pci_onchannelcallback, hbus);
+	if (ret)
+		return ret;
+
+	/* Only use the version that was in use before hibernation. */
+	version[0] = hbus->protocol_version;
+	ret = hv_pci_protocol_negotiation(hdev, version, 1);
+	if (ret)
+		goto out;
+
+	ret = hv_pci_query_relations(hdev);
+	if (ret)
+		goto out;
+
+	ret = hv_pci_enter_d0(hdev);
+	if (ret)
+		goto out;
+
+	ret = hv_send_resources_allocated(hdev);
+	if (ret)
+		goto out;
+
+	prepopulate_bars(hbus);
+
+	hbus->state = hv_pcibus_installed;
+	return 0;
+out:
+	vmbus_close(hdev->channel);
+	return ret;
+}
+
 static const struct hv_vmbus_device_id hv_pci_id_table[] = {
 	/* PCI Pass-through Class ID */
 	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
@@ -3108,6 +3256,8 @@ static struct hv_driver hv_pci_drv = {
 	.id_table	= hv_pci_id_table,
 	.probe		= hv_pci_probe,
 	.remove		= hv_pci_remove,
+	.suspend	= hv_pci_suspend,
+	.resume		= hv_pci_resume,
 };
 
 static void __exit exit_hv_pci_drv(void)
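The suspend path above relies on a general quiescing pattern: disable the channel callback tasklet so it cannot schedule new work, publish the hv_pcibus_removing state that the scheduling paths check, re-enable the tasklet, and only then flush the workqueue and close the channel. A minimal stand-alone sketch of that pattern, outside of pci-hyperv.c and with invented demo_* names (it assumes only the generic tasklet/workqueue APIs), looks like this:

#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

enum demo_state { DEMO_RUNNING, DEMO_REMOVING };

static enum demo_state demo_state = DEMO_RUNNING;
static struct workqueue_struct *demo_wq;
static struct tasklet_struct demo_tasklet;

/* Work handler: stands in for pci_devices_present_work() and friends. */
static void demo_work_fn(struct work_struct *work)
{
	/* Would touch the channel here; must finish before it is closed. */
	kfree(work);
}

/* Tasklet handler: stands in for hv_pci_onchannelcallback() queuing work. */
static void demo_tasklet_fn(unsigned long data)
{
	struct work_struct *work;

	/* Mirrors the hv_pcibus_removing checks added by the patch. */
	if (demo_state == DEMO_REMOVING)
		return;

	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work)
		return;

	INIT_WORK(work, demo_work_fn);
	queue_work(demo_wq, work);
}

/* Suspend-like path: after this returns, no new work can be queued. */
static void demo_quiesce(void)
{
	tasklet_disable(&demo_tasklet);	/* also waits for a running handler */
	demo_state = DEMO_REMOVING;	/* seen by any later tasklet run */
	tasklet_enable(&demo_tasklet);

	flush_workqueue(demo_wq);	/* drain work queued before the flip */
}

static int __init demo_init(void)
{
	demo_wq = alloc_ordered_workqueue("demo_wq", 0);
	if (!demo_wq)
		return -ENOMEM;

	tasklet_init(&demo_tasklet, demo_tasklet_fn, 0);
	return 0;
}

static void __exit demo_exit(void)
{
	demo_quiesce();
	tasklet_kill(&demo_tasklet);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The ordering is what matters, in the sketch as in the driver: the state flip happens while the tasklet is disabled, so any handler invocation that runs afterwards observes it and declines to queue new work, which is what makes the subsequent flush_workqueue() a reliable barrier before vmbus_close().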