|
View:
New views
20 Messages
—
Rating Filter:
Alert me
|
| < Prev | 1 - 2 | Next > |
|
|
[PATCH 0/10] x86: handle HW IOMMU initialization failure gracelyIf HW IOMMU initialization fails (Intel VT-d often does typically due
to BIOS bugs), we fall back to nommu. It doesn't work for the majority since nowadays we have more than 4GB memory so we must use swiotlb instead of nommu. The problem is that it's too late to initialize swiotlb when HW IOMMU initialization fails. We need to allocate swiotlb memory earlier from bootmem allocator. Chris explained the issue in detail: http://marc.info/?l=linux-kernel&m=125657444317079&w=2 The current x86 IOMMU initialization sequence is too complicated and handling the above issue makes it more hacky. This series changes x86 IOMMU initialization sequence to handle the above issue cleanly. The new x86 IOMMU initialization sequence are: 1. initializing swiotlb (and setting swiotlb to 1) in the case of (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to swiotlb_dma_ops or nommu_dma_ops. 2. calling the detection functions of all the IOMMUs 3. the detection function sets x86_init.iommu.iommu_init to the IOMMU initialization function (so we can avoid calling the initialization functions of all the IOMMUs needlessly). 4. if the IOMMU initialization function doesn't need to swiotlb then sets swiotlb to zero (e.g. the initialization is sucessful). 5. if we find that swiotlb is set to zero, we free swiotlb resource. = arch/ia64/kernel/pci-swiotlb.c | 4 +- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/x86/include/asm/amd_iommu.h | 2 - arch/x86/include/asm/calgary.h | 2 - arch/x86/include/asm/gart.h | 5 +-- arch/x86/include/asm/iommu.h | 2 +- arch/x86/include/asm/x86_init.h | 9 ++++ arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 19 ++------ arch/x86/kernel/aperture_64.c | 4 +- arch/x86/kernel/pci-calgary_64.c | 19 ++----- arch/x86/kernel/pci-dma.c | 23 ++++----- arch/x86/kernel/pci-gart_64.c | 16 ++---- arch/x86/kernel/pci-nommu.c | 9 ---- arch/x86/kernel/pci-swiotlb.c | 8 ++-- arch/x86/kernel/x86_init.c | 5 ++ drivers/pci/dmar.c | 7 ++- drivers/pci/intel-iommu.c | 4 +- include/linux/bootmem.h | 1 + include/linux/dmar.h | 10 ---- include/linux/swiotlb.h | 5 +- lib/swiotlb.c | 45 +++++++++++++++--- mm/bootmem.c | 98 +++++++++++++++++++++++++++++--------- 24 files changed, 177 insertions(+), 126 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 01/10] Add iommu_init to x86_init_opsWe call the detections functions of all the IOMMUs then all their
initialization functions. THe latter is pointless since we don't detect multiple different IOMMUs. What we need to do is calling the initialization function of the detected IOMMU. This adds iommu_init hook to x86_init_ops so if an IOMMU detection function can set its initialization function to the hook. Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- arch/x86/include/asm/iommu.h | 1 + arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/pci-dma.c | 2 ++ arch/x86/kernel/x86_init.c | 5 +++++ 4 files changed, 17 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index fd6d21b..42aa977 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -3,6 +3,7 @@ extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); +static inline int iommu_init_noop(void) { return 0; } extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2c756fd..a58644a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -91,6 +91,14 @@ struct x86_init_timers { }; /** + * struct x86_init_iommu - platform specific iommu setup + * @iommu_init: platform specific iommu setup + */ +struct x86_init_iommu { + int (*iommu_init)(void); +}; + +/** * struct x86_init_ops - functions for platform specific setup * */ @@ -101,6 +109,7 @@ struct x86_init_ops { struct x86_init_oem oem; struct x86_init_paging paging; struct x86_init_timers timers; + struct x86_init_iommu iommu; }; /** diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b2a71dc..5609973 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -292,6 +292,8 @@ static int __init pci_iommu_init(void) dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); + calgary_iommu_init(); intel_iommu_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4449a4a..d89ce53 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,6 +14,7 @@ #include <asm/time.h> #include <asm/irq.h> #include <asm/tsc.h> +#include <asm/iommu.h> void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -62,6 +63,10 @@ struct x86_init_ops x86_init __initdata = { .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 02/10] Calgary: convert detect_calgary to use iommu_init hookThis changes detect_calgary() to set init_calgary() to iommu_init hook
if detect_calgary() finds the Calgary IOMMU. We can kill the code to check if we found the IOMMU in init_calgary() since detect_calgary() sets init_calgary() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- arch/x86/include/asm/calgary.h | 2 -- arch/x86/kernel/pci-calgary_64.c | 11 +++++------ arch/x86/kernel/pci-dma.c | 2 -- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index b03bedb..0918654 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -62,10 +62,8 @@ struct cal_chipset_ops { extern int use_calgary; #ifdef CONFIG_CALGARY_IOMMU -extern int calgary_iommu_init(void); extern void detect_calgary(void); #else -static inline int calgary_iommu_init(void) { return 1; } static inline void detect_calgary(void) { return; } #endif diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 971a3be..47bd419 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -46,6 +46,7 @@ #include <asm/dma.h> #include <asm/rio.h> #include <asm/bios_ebda.h> +#include <asm/x86_init.h> #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -1344,6 +1345,8 @@ static void __init get_tce_space_from_tar(void) return; } +int __init calgary_iommu_init(void); + void __init detect_calgary(void) { int bus; @@ -1445,6 +1448,8 @@ void __init detect_calgary(void) /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) swiotlb = 1; + + x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1461,12 +1466,6 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || (swiotlb && !calgary_detected)) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - /* ok, we're trying to use Calgary - let's roll */ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5609973..1b06714 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -294,8 +294,6 @@ static int __init pci_iommu_init(void) x86_init.iommu.iommu_init(); - calgary_iommu_init(); - intel_iommu_init(); amd_iommu_init(); -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 03/10] GART: convert gart_iommu_hole_init to use iommu_init hookThis changes gart_iommu_hole_init() to set gart_iommu_init() to
iommu_init hook if gart_iommu_hole_init() finds the GART IOMMU. We can kill the code to check if we found the IOMMU in gart_iommu_init() since gart_iommu_hole_init() sets gart_iommu_init() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- arch/x86/include/asm/gart.h | 5 +---- arch/x86/kernel/aperture_64.c | 2 ++ arch/x86/kernel/pci-dma.c | 2 -- arch/x86/kernel/pci-gart_64.c | 15 +++++---------- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 6cfdafa..51d28a0 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -35,7 +35,7 @@ extern int gart_iommu_aperture_allowed; extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); +extern int gart_iommu_init(void); extern void gart_iommu_shutdown(void); extern void __init gart_parse_options(char *); extern void gart_iommu_hole_init(void); @@ -48,9 +48,6 @@ extern void gart_iommu_hole_init(void); static inline void early_gart_iommu_check(void) { } -static inline void gart_iommu_init(void) -{ -} static inline void gart_iommu_shutdown(void) { } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 128111d..03933cf 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -28,6 +28,7 @@ #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) iommu_detected = 1; gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1b06714..9830f12 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -298,8 +298,6 @@ static int __init pci_iommu_init(void) amd_iommu_init(); - gart_iommu_init(); - no_iommu_init(); return 0; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a7f1b64..3bcae8f 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -708,7 +708,7 @@ void gart_iommu_shutdown(void) } } -void __init gart_iommu_init(void) +int __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; @@ -718,7 +718,7 @@ void __init gart_iommu_init(void) long i; if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) - return; + return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; @@ -730,13 +730,6 @@ void __init gart_iommu_init(void) (agp_copy_info(agp_bridge, &info) < 0); #endif - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? */ - if (iommu_detected && !gart_iommu_aperture) - return; - if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || @@ -746,7 +739,7 @@ void __init gart_iommu_init(void) "but GART IOMMU not available.\n"); printk(KERN_WARNING "falling back to iommu=soft.\n"); } - return; + return 0; } /* need to map that range */ @@ -838,6 +831,8 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + + return 0; } void __init gart_parse_options(char *p) -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 04/10] amd_iommu: convert amd_iommu_detect to use iommu_init hookThis changes amd_iommu_detect() to set amd_iommu_init to
iommu_init hook if amd_iommu_detect() finds the AMD IOMMU. We can kill the code to check if we found the IOMMU in amd_iommu_init() since amd_iommu_detect() sets amd_iommu_init() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- arch/x86/include/asm/amd_iommu.h | 2 -- arch/x86/kernel/amd_iommu_init.c | 17 +++-------------- arch/x86/kernel/pci-dma.c | 2 -- 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index ac95995..240617e 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -23,7 +23,6 @@ #include <linux/irqreturn.h> #ifdef CONFIG_AMD_IOMMU -extern int amd_iommu_init(void); extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); extern void amd_iommu_detect(void); @@ -32,7 +31,6 @@ extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_shutdown(void); #else -static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } static inline void amd_iommu_shutdown(void) { } #endif diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b4b61d4..52bf59f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -29,6 +29,7 @@ #include <asm/amd_iommu.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/x86_init.h> /* * definitions for the ACPI scanning code @@ -1154,19 +1155,10 @@ static struct sys_device device_amd_iommu = { * functions. Finally it prints some information about AMD IOMMUs and * the driver state and enables the hardware. */ -int __init amd_iommu_init(void) +static int __init amd_iommu_init(void) { int i, ret = 0; - - if (no_iommu) { - printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); - return 0; - } - - if (!amd_iommu_detected) - return -ENODEV; - /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. Upon this information the shared data @@ -1326,10 +1318,7 @@ void __init amd_iommu_detect(void) if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif + x86_init.iommu.iommu_init = amd_iommu_init; } } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 9830f12..6abb8c2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -296,8 +296,6 @@ static int __init pci_iommu_init(void) intel_iommu_init(); - amd_iommu_init(); - no_iommu_init(); return 0; } -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 05/10] intel-iommu: convert detect_intel_iommu to use iommu_init hookThis changes detect_intel_iommu() to set intel_iommu_init() to
iommu_init hook if detect_intel_iommu() finds the IOMMU. Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- arch/x86/kernel/pci-dma.c | 2 -- drivers/pci/dmar.c | 4 ++++ include/linux/dmar.h | 10 ---------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 6abb8c2..f70b766 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -294,8 +294,6 @@ static int __init pci_iommu_init(void) x86_init.iommu.iommu_init(); - intel_iommu_init(); - no_iommu_init(); return 0; } diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 22b02c6..bce9cd7 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -617,6 +617,10 @@ void __init detect_intel_iommu(void) !dmar_disabled) iommu_detected = 1; #endif +#ifdef CONFIG_X86 + if (ret) + x86_init.iommu.iommu_init = intel_iommu_init; +#endif } early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); dmar_tbl = NULL; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 4a2b162..3d12f28 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -208,16 +208,6 @@ struct dmar_atsr_unit { u8 include_all:1; /* include all ports */ }; -/* Intel DMAR initialization functions */ extern int intel_iommu_init(void); -#else -static inline int intel_iommu_init(void) -{ -#ifdef CONFIG_INTR_REMAP - return dmar_dev_scope_init(); -#else - return -ENODEV; #endif -} -#endif /* !CONFIG_DMAR */ #endif /* __DMAR_H__ */ -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 06/10] bootmem: refactor free_all_bootmem_coreFrom: Chris Wright <chrisw@...>
Move the loop that frees all bootmem pages back to page allocator into its own function. This should have not functional effect and allows the function to be reused later. Reviewed-by: FUJITA Tomonori <fujita.tomonori@...> Signed-off-by: Chris Wright <chrisw@...> --- mm/bootmem.c | 61 +++++++++++++++++++++++++++++++++++++++------------------- 1 files changed, 41 insertions(+), 20 deletions(-) diff --git a/mm/bootmem.c b/mm/bootmem.c index 555d5d2..94ef2e7 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -143,17 +143,22 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) +/** + * free_bootmem_pages - frees bootmem pages to page allocator + * @start: start pfn + * @end: end pfn + * @map: bootmem bitmap of reserved pages + * + * This will free the pages in the range @start to @end, making them + * available to the page allocator. The @map will be used to skip + * reserved pages. Returns the count of pages freed. + */ +static unsigned long __init free_bootmem_pages(unsigned long start, + unsigned long end, + unsigned long *map) { + unsigned long cursor, count = 0; int aligned; - struct page *page; - unsigned long start, end, pages, count = 0; - - if (!bdata->node_bootmem_map) - return 0; - - start = bdata->node_min_pfn; - end = bdata->node_low_pfn; /* * If the start is aligned to the machines wordsize, we might @@ -161,27 +166,25 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) */ aligned = !(start & (BITS_PER_LONG - 1)); - bdebug("nid=%td start=%lx end=%lx aligned=%d\n", - bdata - bootmem_node_data, start, end, aligned); + for (cursor = start; cursor < end; cursor += BITS_PER_LONG) { + unsigned long idx, vec; - while (start < end) { - unsigned long *map, idx, vec; - - map = bdata->node_bootmem_map; - idx = start - bdata->node_min_pfn; + idx = cursor - start; vec = ~map[idx / BITS_PER_LONG]; - if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { + if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) { int order = ilog2(BITS_PER_LONG); - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(cursor), order); count += BITS_PER_LONG; } else { unsigned long off = 0; while (vec && off < BITS_PER_LONG) { if (vec & 1) { - page = pfn_to_page(start + off); + struct page *page; + + page = pfn_to_page(cursor + off); __free_pages_bootmem(page, 0); count++; } @@ -189,8 +192,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) off++; } } - start += BITS_PER_LONG; } + return count; +} + +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) +{ + struct page *page; + unsigned long start, end, *map, pages, count = 0; + + if (!bdata->node_bootmem_map) + return 0; + + start = bdata->node_min_pfn; + end = bdata->node_low_pfn; + map = bdata->node_bootmem_map; + + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, + start, end); + + count = free_bootmem_pages(start, end, map); page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - bdata->node_min_pfn; -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 07/10] bootmem: add free_bootmem_lateFrom: Chris Wright <chrisw@...>
Add a new function for freeing bootmem after the bootmem allocator has been released and the unreserved pages given to the page allocator. This allows us to reserve bootmem and then release it if we later discover it was not needed. Reviewed-by: FUJITA Tomonori <fujita.tomonori@...> Signed-off-by: Chris Wright <chrisw@...> --- include/linux/bootmem.h | 1 + mm/bootmem.c | 43 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index dd97fb8..b10ec49 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat, unsigned long addr, unsigned long size); extern void free_bootmem(unsigned long addr, unsigned long size); +extern void free_bootmem_late(unsigned long addr, unsigned long size); /* * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, diff --git a/mm/bootmem.c b/mm/bootmem.c index 94ef2e7..6c04e6e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -151,7 +151,9 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) * * This will free the pages in the range @start to @end, making them * available to the page allocator. The @map will be used to skip - * reserved pages. Returns the count of pages freed. + * reserved pages. In the case that @map is NULL, the bootmem allocator + * is already free and the range is contiguous. Returns the count of + * pages freed. */ static unsigned long __init free_bootmem_pages(unsigned long start, unsigned long end, @@ -164,13 +166,23 @@ static unsigned long __init free_bootmem_pages(unsigned long start, * If the start is aligned to the machines wordsize, we might * be able to free pages in bulks of that order. */ - aligned = !(start & (BITS_PER_LONG - 1)); + if (map) + aligned = !(start & (BITS_PER_LONG - 1)); + else + aligned = 1; for (cursor = start; cursor < end; cursor += BITS_PER_LONG) { - unsigned long idx, vec; + unsigned long vec; - idx = cursor - start; - vec = ~map[idx / BITS_PER_LONG]; + if (map) { + unsigned long idx = cursor - start; + vec = ~map[idx / BITS_PER_LONG]; + } else { + if (end - cursor >= BITS_PER_LONG) + vec = ~0UL; + else + vec = (1UL << (end - cursor)) - 1; + } if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) { int order = ilog2(BITS_PER_LONG); @@ -387,6 +399,27 @@ void __init free_bootmem(unsigned long addr, unsigned long size) } /** + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long start, end; + + kmemleak_free_part(__va(addr), size); + + start = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + totalram_pages += free_bootmem_pages(start, end, NULL); +} + +/** * reserve_bootmem_node - mark a page range as reserved * @pgdat: node the range resides on * @physaddr: starting address of the range -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 08/10] swiotlb: add swiotlb_free functionswiotlb_free function frees all allocated memory for swiotlb.
We need to initialize swiotlb before IOMMU initialization (x86 and powerpc needs to allocate memory from bootmem allocator). If IOMMU initialization is successful, we need to free swiotlb resource (don't want to waste 64MB). Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- include/linux/swiotlb.h | 1 + lib/swiotlb.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 0 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 73b1f1c..51b6cc2 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -88,4 +88,5 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); +extern void __init swiotlb_free(void); #endif /* __LINUX_SWIOTLB_H */ diff --git a/lib/swiotlb.c b/lib/swiotlb.c index ac25cd2..eee512b 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -97,6 +97,8 @@ static phys_addr_t *io_tlb_orig_addr; */ static DEFINE_SPINLOCK(io_tlb_lock); +static int late_alloc; + static int __init setup_io_tlb_npages(char *str) { @@ -262,6 +264,8 @@ swiotlb_late_init_with_default_size(size_t default_size) swiotlb_print_info(bytes); + late_alloc = 1; + return 0; cleanup4: @@ -281,6 +285,32 @@ cleanup1: return -ENOMEM; } +void __init swiotlb_free(void) +{ + if (!io_tlb_overflow_buffer) + return; + + if (late_alloc) { + free_pages((unsigned long)io_tlb_overflow_buffer, + get_order(io_tlb_overflow)); + free_pages((unsigned long)io_tlb_orig_addr, + get_order(io_tlb_nslabs * sizeof(phys_addr_t))); + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + free_pages((unsigned long)io_tlb_start, + get_order(io_tlb_nslabs << IO_TLB_SHIFT)); + } else { + free_bootmem_late(__pa(io_tlb_overflow_buffer), + io_tlb_overflow); + free_bootmem_late(__pa(io_tlb_orig_addr), + io_tlb_nslabs * sizeof(phys_addr_t)); + free_bootmem_late(__pa(io_tlb_list), + io_tlb_nslabs * sizeof(int)); + free_bootmem_late(__pa(io_tlb_start), + io_tlb_nslabs << IO_TLB_SHIFT); + } +} + static int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= virt_to_phys(io_tlb_start) && -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 09/10] swiotlb: export swiotlb_print_infoThis enables us to avoid printing swiotlb memory info when we
initialize swiotlb. After swiotlb initialization, we could find that we dont' need swiotlb. This patch removes the code to print swiotlb memory info in swiotlb_init() and exports the function to do that. Cc: tony.luck@... Cc: benh@... Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- arch/ia64/kernel/pci-swiotlb.c | 4 ++-- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/x86/kernel/pci-swiotlb.c | 3 +-- include/linux/swiotlb.h | 4 ++-- lib/swiotlb.c | 15 ++++++++------- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index 285aae8..53292ab 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = { void __init swiotlb_dma_init(void) { dma_ops = &swiotlb_dma_ops; - swiotlb_init(); + swiotlb_init(1); } void __init pci_swiotlb_init(void) @@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void) swiotlb = 1; printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); machvec_init("dig"); - swiotlb_init(); + swiotlb_init(1); dma_ops = &swiotlb_dma_ops; #else panic("Unable to find Intel IOMMU"); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 53bcf3d..b152de3 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_SWIOTLB if (ppc_swiotlb_enable) - swiotlb_init(); + swiotlb_init(1); #endif paging_init(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 797ea95..1efbcaa 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -551,7 +551,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_SWIOTLB if (ppc_swiotlb_enable) - swiotlb_init(); + swiotlb_init(1); #endif paging_init(); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index aaa6b78..ea20ef7 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -52,8 +52,7 @@ void __init pci_swiotlb_init(void) if (swiotlb_force) swiotlb = 1; if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); + swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 51b6cc2..6b70068 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -20,8 +20,7 @@ struct scatterlist; */ #define IO_TLB_SHIFT 11 -extern void -swiotlb_init(void); +extern void swiotlb_init(int verbose); extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -89,4 +88,5 @@ extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); extern void __init swiotlb_free(void); +extern void swiotlb_print_info(void); #endif /* __LINUX_SWIOTLB_H */ diff --git a/lib/swiotlb.c b/lib/swiotlb.c index eee512b..0c12d7c 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -123,8 +123,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return phys_to_dma(hwdev, virt_to_phys(address)); } -static void swiotlb_print_info(unsigned long bytes) +void swiotlb_print_info(void) { + unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; phys_addr_t pstart, pend; pstart = virt_to_phys(io_tlb_start); @@ -142,7 +143,7 @@ static void swiotlb_print_info(unsigned long bytes) * structures for the software IO TLB used to implement the DMA API. */ void __init -swiotlb_init_with_default_size(size_t default_size) +swiotlb_init_with_default_size(size_t default_size, int verbose) { unsigned long i, bytes; @@ -178,14 +179,14 @@ swiotlb_init_with_default_size(size_t default_size) io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); if (!io_tlb_overflow_buffer) panic("Cannot allocate SWIOTLB overflow buffer!\n"); - - swiotlb_print_info(bytes); + if (verbose) + swiotlb_print_info(); } void __init -swiotlb_init(void) +swiotlb_init(int verbose) { - swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */ + swiotlb_init_with_default_size(64 * (1<<20), verbose); /* default to 64MB */ } /* @@ -262,7 +263,7 @@ swiotlb_late_init_with_default_size(size_t default_size) if (!io_tlb_overflow_buffer) goto cleanup4; - swiotlb_print_info(bytes); + swiotlb_print_info(); late_alloc = 1; -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
[PATCH 10/10] x86: handle HW IOMMU initialization failure gracelyIf HW IOMMU initialization fails (Intel VT-d often does typically due
to BIOS bugs), we fall back to nommu. It doesn't work for the majority since nowadays we have more than 4GB memory so we must use swiotlb instead of nommu. The problem is that it's too late to initialize swiotlb when HW IOMMU initialization fails. We need to allocate swiotlb memory earlier from bootmem allocator. Chris explained the issue in detail: http://marc.info/?l=linux-kernel&m=125657444317079&w=2 The current x86 IOMMU initialization sequence is too complicated and handling the above issue makes it more hacky. This patch changes x86 IOMMU initialization sequence to handle the above issue cleanly. The new x86 IOMMU initialization sequence are: 1. initializing swiotlb (and setting swiotlb to 1) in the case of (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to swiotlb_dma_ops or nommu_dma_ops. 2. calling the detection functions of all the IOMMUs 3. the detection function sets x86_init.iommu.iommu_init to the IOMMU initialization function (so we can avoid calling the initialization functions of all the IOMMUs needlessly). 4. if the IOMMU initialization function doesn't need to swiotlb then sets swiotlb to zero (e.g. the initialization is sucessful). 5. if we find that swiotlb is set to zero, we free swiotlb resource. Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> --- arch/x86/include/asm/iommu.h | 1 - arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/pci-calgary_64.c | 10 +--------- arch/x86/kernel/pci-dma.c | 17 +++++++++-------- arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/pci-nommu.c | 9 --------- arch/x86/kernel/pci-swiotlb.c | 5 +++-- drivers/pci/dmar.c | 3 +-- drivers/pci/intel-iommu.c | 4 ++-- 11 files changed, 20 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 42aa977..9254199 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -2,7 +2,6 @@ #define _ASM_X86_IOMMU_H extern void pci_iommu_shutdown(void); -extern void no_iommu_init(void); static inline int iommu_init_noop(void) { return 0; } extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 98f230f..383bf83 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2108,8 +2108,8 @@ int __init amd_iommu_init_dma_ops(void) prealloc_protection_domains(); iommu_detected = 1; - force_iommu = 1; bad_dma_address = 0; + swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 52bf59f..5eda21a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1312,7 +1312,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 03933cf..e0dfb68 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -458,7 +458,7 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { + } else if (!valid_agp) { /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 47bd419..833f491 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1360,7 +1360,7 @@ void __init detect_calgary(void) * if the user specified iommu=off or iommu=soft or we found * another HW IOMMU already, bail out. */ - if (swiotlb || no_iommu || iommu_detected) + if (no_iommu || iommu_detected) return; if (!use_calgary) @@ -1445,10 +1445,6 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", specified_table_size); - /* swiotlb for devices that aren't behind the Calgary. */ - if (max_pfn > MAX_DMA32_PFN) - swiotlb = 1; - x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1476,11 +1472,7 @@ int __init calgary_iommu_init(void) return ret; } - force_iommu = 1; bad_dma_address = 0x0; - /* dma_ops is set to swiotlb or nommu */ - if (!dma_ops) - dma_ops = &nommu_dma_ops; return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f70b766..28fd54a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -128,20 +128,16 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + pci_swiotlb_init(); - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, @@ -291,10 +287,15 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif - x86_init.iommu.iommu_init(); - no_iommu_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); + return 0; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 3bcae8f..e36f5f4 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -831,6 +831,7 @@ int __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + swiotlb = 0; return 0; } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a3933d4..875e382 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index ea20ef7..607fbb6 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -46,7 +46,7 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif if (swiotlb_force) @@ -54,5 +54,6 @@ void __init pci_swiotlb_init(void) if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; - } + } else + dma_ops = &nommu_dma_ops; } diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index bce9cd7..4373996 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -613,8 +613,7 @@ void __init detect_intel_iommu(void) "x2apic and Intr-remapping.\n"); #endif #ifdef CONFIG_DMAR - if (ret && !no_iommu && !iommu_detected && !swiotlb && - !dmar_disabled) + if (ret && !no_iommu && !iommu_detected && !dmar_disabled) iommu_detected = 1; #endif #ifdef CONFIG_X86 diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index b1e97e6..c7a8bcf 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3231,7 +3231,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || swiotlb || dmar_disabled) + if (no_iommu || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -3252,7 +3252,7 @@ int __init intel_iommu_init(void) "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n"); init_timer(&unmap_timer); - force_iommu = 1; + swiotlb = 0; dma_ops = &intel_dma_ops; init_iommu_sysfs(); -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 06/10] bootmem: refactor free_all_bootmem_core* FUJITA Tomonori <fujita.tomonori@...> wrote: > From: Chris Wright <chrisw@...> > > Move the loop that frees all bootmem pages back to page allocator into > its own function. This should have not functional effect and allows the > function to be reused later. > > Reviewed-by: FUJITA Tomonori <fujita.tomonori@...> > Signed-off-by: Chris Wright <chrisw@...> Small signoff-handling sidenote: in such cases (when you are handling other people's patches) the expected workflow is to add your SOB line after Chris's line, instead of the Reviewed-by line. I.e. it should look like this: Signed-off-by: Chris Wright <chrisw@...> Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> Showing the path of the patch - ending with your signoff. Thanks, Ingo -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 06/10] bootmem: refactor free_all_bootmem_core* FUJITA Tomonori <fujita.tomonori@...> wrote: > From: Chris Wright <chrisw@...> > > Move the loop that frees all bootmem pages back to page allocator into > its own function. This should have not functional effect and allows the > function to be reused later. > > Reviewed-by: FUJITA Tomonori <fujita.tomonori@...> > Signed-off-by: Chris Wright <chrisw@...> > --- > mm/bootmem.c | 61 +++++++++++++++++++++++++++++++++++++++------------------- > 1 files changed, 41 insertions(+), 20 deletions(-) > > diff --git a/mm/bootmem.c b/mm/bootmem.c > index 555d5d2..94ef2e7 100644 > --- a/mm/bootmem.c > +++ b/mm/bootmem.c > @@ -143,17 +143,22 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) > return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); > } > > -static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) > +/** > + * free_bootmem_pages - frees bootmem pages to page allocator > + * @start: start pfn > + * @end: end pfn > + * @map: bootmem bitmap of reserved pages > + * > + * This will free the pages in the range @start to @end, making them > + * available to the page allocator. The @map will be used to skip > + * reserved pages. Returns the count of pages freed. > + */ > +static unsigned long __init free_bootmem_pages(unsigned long start, > + unsigned long end, > + unsigned long *map) > { > + unsigned long cursor, count = 0; > int aligned; > - struct page *page; > - unsigned long start, end, pages, count = 0; > - > - if (!bdata->node_bootmem_map) > - return 0; > - > - start = bdata->node_min_pfn; > - end = bdata->node_low_pfn; > > /* > * If the start is aligned to the machines wordsize, we might > @@ -161,27 +166,25 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) > */ > aligned = !(start & (BITS_PER_LONG - 1)); > > - bdebug("nid=%td start=%lx end=%lx aligned=%d\n", > - bdata - bootmem_node_data, start, end, aligned); > + for (cursor = start; cursor < end; cursor += BITS_PER_LONG) { > + unsigned long idx, vec; > > - while (start < end) { > - unsigned long *map, idx, vec; > - > - map = bdata->node_bootmem_map; > - idx = start - bdata->node_min_pfn; > + idx = cursor - start; > vec = ~map[idx / BITS_PER_LONG]; > > - if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { > + if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) { > int order = ilog2(BITS_PER_LONG); > > - __free_pages_bootmem(pfn_to_page(start), order); > + __free_pages_bootmem(pfn_to_page(cursor), order); > count += BITS_PER_LONG; > } else { > unsigned long off = 0; > > while (vec && off < BITS_PER_LONG) { > if (vec & 1) { > - page = pfn_to_page(start + off); > + struct page *page; > + > + page = pfn_to_page(cursor + off); > __free_pages_bootmem(page, 0); > count++; > } > @@ -189,8 +192,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) > off++; > } > } > - start += BITS_PER_LONG; > } > + return count; > +} > + > +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) > +{ > + struct page *page; > + unsigned long start, end, *map, pages, count = 0; > + > + if (!bdata->node_bootmem_map) > + return 0; > + > + start = bdata->node_min_pfn; > + end = bdata->node_low_pfn; > + map = bdata->node_bootmem_map; > + > + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, > + start, end); > + > + count = free_bootmem_pages(start, end, map); > > page = virt_to_page(bdata->node_bootmem_map); > pages = bdata->node_low_pfn - bdata->node_min_pfn; Small nit: the initialization of count to 0 seems superfluous. Ingo -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 07/10] bootmem: add free_bootmem_late* FUJITA Tomonori <fujita.tomonori@...> wrote: > From: Chris Wright <chrisw@...> > > Add a new function for freeing bootmem after the bootmem allocator has > been released and the unreserved pages given to the page allocator. > This allows us to reserve bootmem and then release it if we later > discover it was not needed. > > Reviewed-by: FUJITA Tomonori <fujita.tomonori@...> > Signed-off-by: Chris Wright <chrisw@...> > --- > include/linux/bootmem.h | 1 + > mm/bootmem.c | 43 ++++++++++++++++++++++++++++++++++++++----- > 2 files changed, 39 insertions(+), 5 deletions(-) Hm, we are now further complicating the bootmem model. I think we could remove the bootmem allocator middle man altogether. This can be done by initializing the page allocator sooner and by extending (already existing) 'reserve memory early on' mechanisms in architecture code. (the reserve_early*() APIs in x86 for example) Right now we have 5 memory allocation models on x86, initialized gradually: - allocator (buddy) [generic] - early allocator (bootmem) [generic] - very early allocator (reserve_early*()) [x86] - very very early allocator (early brk model) [x86] - very very very early allocator (build time .data/.bss) [generic] Seems excessive. The reserve_early() method is list/range based and can handle vast amounts of not very fragmented memory - perfect for basically all the real bootmem purposes (which is to bootstrap the buddy). reserve_early() allocated memory could be freed into the buddy later on as well. The main reason why bootmem is 'destroyed' during free-to-buddy is because it has excessive internal bitmaps we want to free. With a list/range based reserve_early() mechanism there's no such problem - they can linger indefinitely and there's near zero allocation management overhead. reserve_early() might need some small amount of extra work before it can be used as a generic early allocator - like adding a node field to it (so that the buddy can then pick those ranges up in a NUMA aware fashion) - but nothing very complex. Thoughts? Ingo -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 07/10] bootmem: add free_bootmem_lateHello,
Ingo Molnar wrote: > Hm, we are now further complicating the bootmem model. > > I think we could remove the bootmem allocator middle man altogether. > > This can be done by initializing the page allocator sooner and by > extending (already existing) 'reserve memory early on' mechanisms in > architecture code. (the reserve_early*() APIs in x86 for example) ... > reserve_early() might need some small amount of extra work before it can > be used as a generic early allocator - like adding a node field to it > (so that the buddy can then pick those ranges up in a NUMA aware > fashion) - but nothing very complex. ISTR an attempt to initialize the kmalloc allocator much earlier during boot such that it can completely replace the bootmem allocator, which would nicely remove all the complications although it may require the kmalloc allocator to go through more complex boot strapping steps. I didn't follow how that went. Did it prove to be unworkable? As for percpu allocator, as long as it can grab memory in NUMA aware way, changing shouldn't be a problem at all. The good thing about bootmem allocator is that it is generic and standardizes memory bootstrapping across different architectures. As long as that can be maintained, makings things simpler would be great. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 07/10] bootmem: add free_bootmem_lateHi Tejun,
On Wed, Oct 28, 2009 at 10:00 AM, Tejun Heo <tj@...> wrote: > ISTR an attempt to initialize the kmalloc allocator much earlier > during boot such that it can completely replace the bootmem allocator, > which would nicely remove all the complications although it may > require the kmalloc allocator to go through more complex boot > strapping steps. I didn't follow how that went. Did it prove to be > unworkable? We're doing it before scheduler init now but I haven't put any effort into moving it earlier than that yet. I don't see any fundamental reason we can't do that but the practical problem is that we're going to affect architecture specific boot code which is really hard to test. Pekka -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 07/10] bootmem: add free_bootmem_lateHello,
Pekka Enberg wrote: > On Wed, Oct 28, 2009 at 10:00 AM, Tejun Heo <tj@...> wrote: >> ISTR an attempt to initialize the kmalloc allocator much earlier >> during boot such that it can completely replace the bootmem allocator, >> which would nicely remove all the complications although it may >> require the kmalloc allocator to go through more complex boot >> strapping steps. I didn't follow how that went. Did it prove to be >> unworkable? > > We're doing it before scheduler init now but I haven't put any effort > into moving it earlier than that yet. I don't see any fundamental > reason we can't do that but the practical problem is that we're going > to affect architecture specific boot code which is really hard to > test. Thanks for the explanation. It would be really great if we can pull that off someday. This should be doable architecture-by-architecture, right? You can, for example, first convert x86 and then make bootmem allocator thin wrapper around the slab allocator. After all archs have been converted, the wrappers can be dropped. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 0/10] x86: handle HW IOMMU initialization failure gracelyOn Wed, Oct 28, 2009 at 03:47:34PM +0900, FUJITA Tomonori wrote:
> If HW IOMMU initialization fails (Intel VT-d often does typically due > to BIOS bugs), we fall back to nommu. It doesn't work for the majority > since nowadays we have more than 4GB memory so we must use swiotlb > instead of nommu. > > The problem is that it's too late to initialize swiotlb when HW IOMMU > initialization fails. We need to allocate swiotlb memory earlier from > bootmem allocator. Chris explained the issue in detail: > > http://marc.info/?l=linux-kernel&m=125657444317079&w=2 > > > The current x86 IOMMU initialization sequence is too complicated and > handling the above issue makes it more hacky. > > This series changes x86 IOMMU initialization sequence to handle the > above issue cleanly. > > The new x86 IOMMU initialization sequence are: > > 1. initializing swiotlb (and setting swiotlb to 1) in the case of > (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to > swiotlb_dma_ops or nommu_dma_ops. > > 2. calling the detection functions of all the IOMMUs > > 3. the detection function sets x86_init.iommu.iommu_init to the IOMMU > initialization function (so we can avoid calling the initialization > functions of all the IOMMUs needlessly). > > 4. if the IOMMU initialization function doesn't need to swiotlb then > sets swiotlb to zero (e.g. the initialization is sucessful). > > 5. if we find that swiotlb is set to zero, we free swiotlb resource. I started to test this patchset. It breaks the iommu=soft selection to force using swiotlb usage. On my machine always the AMD IOMMU driver gots selected and initialized. Joerg -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 02/10] Calgary: convert detect_calgary to use iommu_init hookOn Wed, Oct 28, 2009 at 03:47:36PM +0900, FUJITA Tomonori wrote:
> This changes detect_calgary() to set init_calgary() to iommu_init hook > if detect_calgary() finds the Calgary IOMMU. > > We can kill the code to check if we found the IOMMU in init_calgary() > since detect_calgary() sets init_calgary() only when it found the > IOMMU. I like this patchset a lot. Our with the old cruft! > Signed-off-by: FUJITA Tomonori <fujita.tomonori@...> Acked-by: Muli Ben-Yehuda <muli@...> Cheers, Muli -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
|
|
Re: [PATCH 0/10] x86: handle HW IOMMU initialization failure gracelyOn Wed, 28 Oct 2009 14:21:19 +0100
Joerg Roedel <joerg.roedel@...> wrote: > > The new x86 IOMMU initialization sequence are: > > > > 1. initializing swiotlb (and setting swiotlb to 1) in the case of > > (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to > > swiotlb_dma_ops or nommu_dma_ops. > > > > 2. calling the detection functions of all the IOMMUs > > > > 3. the detection function sets x86_init.iommu.iommu_init to the IOMMU > > initialization function (so we can avoid calling the initialization > > functions of all the IOMMUs needlessly). > > > > 4. if the IOMMU initialization function doesn't need to swiotlb then > > sets swiotlb to zero (e.g. the initialization is sucessful). > > > > 5. if we find that swiotlb is set to zero, we free swiotlb resource. > > I started to test this patchset. It breaks the iommu=soft selection to > force using swiotlb usage. On my machine always the AMD IOMMU driver > gots selected and initialized. Thanks for testing the patchset. Sorry about the bug. I need to add 1.5 to the above sequence: 1.5 if swiotlb usage is forced by the boot option, we skip the rest. Here's a patch against the patchset. diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h index dcbaea7..f0acde6 100644 --- a/arch/ia64/include/asm/swiotlb.h +++ b/arch/ia64/include/asm/swiotlb.h @@ -4,8 +4,6 @@ #include <linux/dma-mapping.h> #include <linux/swiotlb.h> -extern int swiotlb_force; - #ifdef CONFIG_SWIOTLB extern int swiotlb; extern void pci_swiotlb_init(void); diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index b9e4e20..193aa75 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -5,8 +5,6 @@ /* SWIOTLB interface */ -extern int swiotlb_force; - #ifdef CONFIG_SWIOTLB extern int swiotlb; extern void pci_swiotlb_init(void); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 28fd54a..2583a0b 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -124,11 +124,15 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { + /* swiotlb is forced by the boot option */ + int use_swiotlb = swiotlb; #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif pci_swiotlb_init(); + if (use_swiotlb) + return; gart_iommu_hole_init(); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 607fbb6..17ce422 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -49,8 +49,6 @@ void __init pci_swiotlb_init(void) if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif - if (swiotlb_force) - swiotlb = 1; if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 0c12d7c..2eae139 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -57,7 +57,7 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -int swiotlb_force; +static int swiotlb_force; /* * Used to do a quick range check in unmap_single and @@ -109,8 +109,10 @@ setup_io_tlb_npages(char *str) } if (*str == ',') ++str; - if (!strcmp(str, "force")) + if (!strcmp(str, "force")) { swiotlb_force = 1; + swiotlb = 1; + } return 1; } __setup("swiotlb=", setup_io_tlb_npages); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@... More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |
| < Prev | 1 - 2 | Next > |
| Free embeddable forum powered by Nabble | Forum Help |