pseries: SLOF PCI flag day

Currently on the pseries machine the SLOF firmware is used normally,
but we bypass it when -kernel is specified.  Having these two
different boot paths can cause some confusion.

In particular, at present we need to "probe" the (emulated) PCI bus and
produce device tree nodes for the PCI devices in qemu, for the -kernel
case.  In the SLOF case, the firmware takes the device tree from qemu,
adds some stuff to it, then passes it on to the kernel.

It's been decided that a better approach is to always boot through
SLOF, even when using -kernel.  With this approach we can leave PCI
probing and device node creation to SLOF in all cases which removes a
bunch of code in qemu, and avoids iterating the PCI devices from the
machine specific init code which we're not supposed to do.

This patch changes qemu to always boot through SLOF, and not to create
PCI nodes.  Simultaneously it updates the included version of SLOF
(submodule and binary image) to one which supports (and requires) the
new approach.

The new SLOF version also includes a number of unrelated enhancements:
support for booting from virtio-pci devices and e1000, greatly
improved FCode support and many bugfixes.  It also makes SLOF ready to
be used even when specifying a kernel on the qemu command line.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
This commit is contained in:
Benjamin Herrenschmidt 2012-01-11 19:46:28 +00:00 committed by Alexander Graf
parent c9c3c80af7
commit 4d8d5467cd
5 changed files with 104 additions and 167 deletions

View File

@ -50,19 +50,29 @@
#include <libfdt.h>
#define KERNEL_LOAD_ADDR 0x00000000
#define INITRD_LOAD_ADDR 0x02800000
/* SLOF memory layout:
*
* SLOF raw image loaded at 0, copies its romfs right below the flat
* device-tree, then position SLOF itself 31M below that
*
* So we set FW_OVERHEAD to 40MB which should account for all of that
* and more
*
* We load our kernel at 4M, leaving space for SLOF initial image
*/
#define FDT_MAX_SIZE 0x10000
#define RTAS_MAX_SIZE 0x10000
#define FW_MAX_SIZE 0x400000
#define FW_FILE_NAME "slof.bin"
#define FW_OVERHEAD 0x2800000
#define KERNEL_LOAD_ADDR FW_MAX_SIZE
#define MIN_RMA_SLOF 128UL
#define MIN_RMA_SLOF 128UL
#define TIMEBASE_FREQ 512000000ULL
#define MAX_CPUS 256
#define XICS_IRQS 1024
#define XICS_IRQS 1024
#define SPAPR_PCI_BUID 0x800000020000001ULL
#define SPAPR_PCI_MEM_WIN_ADDR (0x10000000000ULL + 0xA0000000)
@ -139,6 +149,7 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
target_phys_addr_t rma_size,
target_phys_addr_t initrd_base,
target_phys_addr_t initrd_size,
target_phys_addr_t kernel_size,
const char *boot_device,
const char *kernel_cmdline,
long hash_shift)
@ -176,6 +187,12 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
fdt = g_malloc0(FDT_MAX_SIZE);
_FDT((fdt_create(fdt, FDT_MAX_SIZE)));
if (kernel_size) {
_FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
}
if (initrd_size) {
_FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
}
_FDT((fdt_finish_reservemap(fdt)));
/* Root node */
@ -197,15 +214,13 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
&start_prop, sizeof(start_prop))));
_FDT((fdt_property(fdt, "linux,initrd-end",
&end_prop, sizeof(end_prop))));
_FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
if (kernel_size) {
uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
cpu_to_be64(kernel_size) };
/*
* Because we don't always invoke any firmware, we can't rely on
* that to do BAR allocation. Long term, we should probably do
* that ourselves, but for now, this setting (plus advertising the
* current BARs as 0) causes sufficiently recent kernels to to the
* BAR assignment themselves */
_FDT((fdt_property_cell(fdt, "linux,pci-probe-only", 0)));
_FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
}
_FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
_FDT((fdt_end_node(fdt)));
@ -445,6 +460,12 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
_FDT((fdt_pack(fdt)));
if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
fdt_totalsize(fdt), FDT_MAX_SIZE);
exit(1);
}
cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
g_free(fdt);
@ -494,8 +515,9 @@ static void ppc_spapr_init(ram_addr_t ram_size,
MemoryRegion *sysmem = get_system_memory();
MemoryRegion *ram = g_new(MemoryRegion, 1);
target_phys_addr_t rma_alloc_size, rma_size;
uint32_t initrd_base;
long kernel_size, initrd_size, fw_size;
uint32_t initrd_base = 0;
long kernel_size = 0, initrd_size = 0;
long load_limit, rtas_limit, fw_size;
long pteg_shift = 17;
char *filename;
@ -517,11 +539,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
rma_size = ram_size;
}
/* We place the device tree just below either the top of the RMA,
/* We place the device tree and RTAS just below either the top of the RMA,
* or just below 2GB, whichever is lowere, so that it can be
* processed with 32-bit real mode code if necessary */
spapr->fdt_addr = MIN(rma_size, 0x80000000) - FDT_MAX_SIZE;
spapr->rtas_addr = spapr->fdt_addr - RTAS_MAX_SIZE;
rtas_limit = MIN(rma_size, 0x80000000);
spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
load_limit = spapr->fdt_addr - FW_OVERHEAD;
/* init CPUs */
if (cpu_model == NULL) {
@ -577,13 +601,19 @@ static void ppc_spapr_init(ram_addr_t ram_size,
filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
spapr->rtas_size = load_image_targphys(filename, spapr->rtas_addr,
ram_size - spapr->rtas_addr);
rtas_limit - spapr->rtas_addr);
if (spapr->rtas_size < 0) {
hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
exit(1);
}
if (spapr->rtas_size > RTAS_MAX_SIZE) {
hw_error("RTAS too big ! 0x%lx bytes (max is 0x%x)\n",
spapr->rtas_size, RTAS_MAX_SIZE);
exit(1);
}
g_free(filename);
/* Set up Interrupt Controller */
spapr->icp = xics_system_init(XICS_IRQS);
spapr->next_irq = 16;
@ -622,6 +652,20 @@ static void ppc_spapr_init(ram_addr_t ram_size,
spapr_vscsi_create(spapr->vio_bus, 0x2000 + i);
}
if (rma_size < (MIN_RMA_SLOF << 20)) {
fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
"%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
exit(1);
}
fprintf(stderr, "sPAPR memory map:\n");
fprintf(stderr, "RTAS : 0x%08lx..%08lx\n",
(unsigned long)spapr->rtas_addr,
(unsigned long)(spapr->rtas_addr + spapr->rtas_size - 1));
fprintf(stderr, "FDT : 0x%08lx..%08lx\n",
(unsigned long)spapr->fdt_addr,
(unsigned long)(spapr->fdt_addr + FDT_MAX_SIZE - 1));
if (kernel_filename) {
uint64_t lowaddr = 0;
@ -630,57 +674,60 @@ static void ppc_spapr_init(ram_addr_t ram_size,
if (kernel_size < 0) {
kernel_size = load_image_targphys(kernel_filename,
KERNEL_LOAD_ADDR,
ram_size - KERNEL_LOAD_ADDR);
load_limit - KERNEL_LOAD_ADDR);
}
if (kernel_size < 0) {
fprintf(stderr, "qemu: could not load kernel '%s'\n",
kernel_filename);
exit(1);
}
fprintf(stderr, "Kernel : 0x%08x..%08lx\n",
KERNEL_LOAD_ADDR, KERNEL_LOAD_ADDR + kernel_size - 1);
/* load initrd */
if (initrd_filename) {
initrd_base = INITRD_LOAD_ADDR;
/* Try to locate the initrd in the gap between the kernel
* and the firmware. Add a bit of space just in case
*/
initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
initrd_size = load_image_targphys(initrd_filename, initrd_base,
ram_size - initrd_base);
load_limit - initrd_base);
if (initrd_size < 0) {
fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
initrd_filename);
exit(1);
}
fprintf(stderr, "Ramdisk : 0x%08lx..%08lx\n",
(long)initrd_base, (long)(initrd_base + initrd_size - 1));
} else {
initrd_base = 0;
initrd_size = 0;
}
}
spapr->entry_point = KERNEL_LOAD_ADDR;
} else {
if (rma_size < (MIN_RMA_SLOF << 20)) {
fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
"%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
exit(1);
}
filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, FW_FILE_NAME);
fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
if (fw_size < 0) {
hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
exit(1);
}
g_free(filename);
spapr->entry_point = 0x100;
initrd_base = 0;
initrd_size = 0;
filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, FW_FILE_NAME);
fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
if (fw_size < 0) {
hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
exit(1);
}
g_free(filename);
fprintf(stderr, "Firmware load : 0x%08x..%08lx\n",
0, fw_size);
fprintf(stderr, "Firmware runtime : 0x%08lx..%08lx\n",
load_limit, (unsigned long)spapr->fdt_addr);
/* SLOF will startup the secondary CPUs using RTAS,
rather than expecting a kexec() style entry */
for (env = first_cpu; env != NULL; env = env->next_cpu) {
env->halted = 1;
}
spapr->entry_point = 0x100;
/* SLOF will startup the secondary CPUs using RTAS */
for (env = first_cpu; env != NULL; env = env->next_cpu) {
env->halted = 1;
}
/* Prepare the device tree */
spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
initrd_base, initrd_size,
kernel_size,
boot_device, kernel_cmdline,
pteg_shift + 7);
assert(spapr->fdt_skel != NULL);

View File

@ -324,31 +324,13 @@ void spapr_create_phb(sPAPREnvironment *spapr,
#define b_fff(x) b_x((x), 8, 3) /* function number */
#define b_rrrrrrrr(x) b_x((x), 0, 8) /* register number */
static uint32_t regtype_to_ss(uint8_t type)
{
if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
return 3;
}
if (type == PCI_BASE_ADDRESS_SPACE_IO) {
return 1;
}
return 2;
}
int spapr_populate_pci_devices(sPAPRPHBState *phb,
uint32_t xics_phandle,
void *fdt)
{
PCIBus *bus = phb->host_state.bus;
int bus_off, node_off = 0, devid, fn, i, n, devices;
DeviceState *qdev;
int bus_off, i;
char nodename[256];
struct {
uint32_t hi;
uint64_t addr;
uint64_t size;
} __attribute__((packed)) reg[PCI_NUM_REGIONS + 1],
assigned_addresses[PCI_NUM_REGIONS];
uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) };
struct {
uint32_t hi;
@ -369,7 +351,7 @@ int spapr_populate_pci_devices(sPAPRPHBState *phb,
};
uint64_t bus_reg[] = { cpu_to_be64(phb->buid), 0 };
uint32_t interrupt_map_mask[] = {
cpu_to_be32(b_ddddd(-1)|b_fff(-1)), 0x0, 0x0, 0x0};
cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, 0x0};
uint32_t interrupt_map[bus->nirq][7];
/* Start populating the FDT */
@ -397,118 +379,26 @@ int spapr_populate_pci_devices(sPAPRPHBState *phb,
_FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range)));
_FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges)));
_FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
_FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
&interrupt_map_mask, sizeof(interrupt_map_mask)));
_FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
/* Populate PCI devices and allocate IRQs */
devices = 0;
QTAILQ_FOREACH(qdev, &bus->qbus.children, sibling) {
PCIDevice *dev = DO_UPCAST(PCIDevice, qdev, qdev);
int irq_index = pci_spapr_map_irq(dev, 0);
uint32_t *irqmap = interrupt_map[devices];
uint8_t *config = dev->config;
devid = dev->devfn >> 3;
fn = dev->devfn & 7;
sprintf(nodename, "pci@%u,%u", devid, fn);
/* Allocate interrupt from the map */
if (devid > bus->nirq) {
printf("Unexpected behaviour in spapr_populate_pci_devices,"
"wrong devid %u\n", devid);
exit(-1);
}
irqmap[0] = cpu_to_be32(b_ddddd(devid)|b_fff(fn));
/* Build the interrupt-map, this must matches what is done
* in pci_spapr_map_irq
*/
_FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
&interrupt_map_mask, sizeof(interrupt_map_mask)));
for (i = 0; i < 7; i++) {
uint32_t *irqmap = interrupt_map[i];
irqmap[0] = cpu_to_be32(b_ddddd(i)|b_fff(0));
irqmap[1] = 0;
irqmap[2] = 0;
irqmap[3] = 0;
irqmap[4] = cpu_to_be32(xics_phandle);
irqmap[5] = cpu_to_be32(phb->lsi_table[irq_index].dt_irq);
irqmap[5] = cpu_to_be32(phb->lsi_table[i % SPAPR_PCI_NUM_LSI].dt_irq);
irqmap[6] = cpu_to_be32(0x8);
/* Add node to FDT */
node_off = fdt_add_subnode(fdt, bus_off, nodename);
if (node_off < 0) {
return node_off;
}
_FDT(fdt_setprop_cell(fdt, node_off, "vendor-id",
pci_get_word(&config[PCI_VENDOR_ID])));
_FDT(fdt_setprop_cell(fdt, node_off, "device-id",
pci_get_word(&config[PCI_DEVICE_ID])));
_FDT(fdt_setprop_cell(fdt, node_off, "revision-id",
pci_get_byte(&config[PCI_REVISION_ID])));
_FDT(fdt_setprop_cell(fdt, node_off, "class-code",
pci_get_long(&config[PCI_CLASS_REVISION]) >> 8));
_FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id",
pci_get_word(&config[PCI_SUBSYSTEM_ID])));
_FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id",
pci_get_word(&config[PCI_SUBSYSTEM_VENDOR_ID])));
/* Config space region comes first */
reg[0].hi = cpu_to_be32(
b_n(0) |
b_p(0) |
b_t(0) |
b_ss(0/*config*/) |
b_bbbbbbbb(0) |
b_ddddd(devid) |
b_fff(fn));
reg[0].addr = 0;
reg[0].size = 0;
n = 0;
for (i = 0; i < ARRAY_SIZE(bars); ++i) {
if (0 == dev->io_regions[i].size) {
continue;
}
reg[n+1].hi = cpu_to_be32(
b_n(0) |
b_p(0) |
b_t(0) |
b_ss(regtype_to_ss(dev->io_regions[i].type)) |
b_bbbbbbbb(0) |
b_ddddd(devid) |
b_fff(fn) |
b_rrrrrrrr(bars[i]));
reg[n+1].addr = 0;
reg[n+1].size = cpu_to_be64(dev->io_regions[i].size);
assigned_addresses[n].hi = cpu_to_be32(
b_n(1) |
b_p(0) |
b_t(0) |
b_ss(regtype_to_ss(dev->io_regions[i].type)) |
b_bbbbbbbb(0) |
b_ddddd(devid) |
b_fff(fn) |
b_rrrrrrrr(bars[i]));
/*
* Writing zeroes to assigned_addresses causes the guest kernel to
* reassign BARs
*/
assigned_addresses[n].addr = cpu_to_be64(dev->io_regions[i].addr);
assigned_addresses[n].size = reg[n+1].size;
++n;
}
_FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1)));
_FDT(fdt_setprop(fdt, node_off, "assigned-addresses",
assigned_addresses,
sizeof(assigned_addresses[0])*(n)));
_FDT(fdt_setprop_cell(fdt, node_off, "interrupts",
pci_get_byte(&config[PCI_INTERRUPT_PIN])));
++devices;
}
/* Write interrupt map */
_FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
devices * sizeof(interrupt_map[0])));
7 * sizeof(interrupt_map[0])));
return 0;
}

View File

@ -17,7 +17,7 @@
- SLOF (Slimline Open Firmware) is a free IEEE 1275 Open Firmware
implementation for certain IBM POWER hardware. The sources are at
https://github.com/dgibson/SLOF, and the image currently in qemu is
built from git tag qemu-slof-20111013.
built from git tag qemu-slof-20120111.1.
- sgabios (the Serial Graphics Adapter option ROM) provides a means for
legacy x86 software to communicate with an attached serial console as

Binary file not shown.

@ -1 +1 @@
Subproject commit 32e3430c018ceb8413cb808477449d1968c42497
Subproject commit ab062ff3b37c39649f2b0d94ed607adc6f6b3c7d