Diffstat (limited to 'arch')
-rw-r--r--  arch/i386/kernel/acpi/boot.c                |   5
-rw-r--r--  arch/i386/kernel/apic.c                     |  23
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/powernow-k8.c  |  18
-rw-r--r--  arch/i386/kernel/mpparse.c                  |  21
-rw-r--r--  arch/i386/kernel/reboot_fixups.c            |   2
-rw-r--r--  arch/i386/kernel/setup.c                    |  36
-rw-r--r--  arch/i386/mm/init.c                         |   2
-rw-r--r--  arch/i386/pci/direct.c                      |   9
-rw-r--r--  arch/i386/pci/mmconfig.c                    |  67
-rw-r--r--  arch/sparc64/Kconfig                        |   3
-rw-r--r--  arch/sparc64/defconfig                      |  10
-rw-r--r--  arch/sparc64/kernel/ptrace.c                |  23
-rw-r--r--  arch/sparc64/kernel/smp.c                   |  35
-rw-r--r--  arch/sparc64/kernel/traps.c                 |   4
-rw-r--r--  arch/x86_64/Kconfig                         |   5
-rw-r--r--  arch/x86_64/Makefile                        |  24
-rw-r--r--  arch/x86_64/defconfig                       |  42
-rw-r--r--  arch/x86_64/ia32/ia32entry.S                |  23
-rw-r--r--  arch/x86_64/kernel/aperture.c               |   2
-rw-r--r--  arch/x86_64/kernel/e820.c                   |  36
-rw-r--r--  arch/x86_64/kernel/entry.S                  |  28
-rw-r--r--  arch/x86_64/kernel/mce.c                    |   8
-rw-r--r--  arch/x86_64/kernel/nmi.c                    |   7
-rw-r--r--  arch/x86_64/kernel/pci-dma.c                |   2
-rw-r--r--  arch/x86_64/kernel/process.c                |  10
-rw-r--r--  arch/x86_64/kernel/setup.c                  |   4
-rw-r--r--  arch/x86_64/kernel/time.c                   |   4
-rw-r--r--  arch/x86_64/kernel/vmlinux.lds.S            |   2
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c            |   2
-rw-r--r--  arch/x86_64/mm/init.c                       |  37
-rw-r--r--  arch/x86_64/mm/numa.c                       |  46
-rw-r--r--  arch/x86_64/mm/srat.c                       | 170
-rw-r--r--  arch/x86_64/pci/mmconfig.c                  |  53
33 files changed, 573 insertions(+), 190 deletions(-)
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 033066176b3..8dab3527bc9 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -215,7 +215,7 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
{
struct acpi_table_madt *madt = NULL;
- if (!phys_addr || !size)
+ if (!phys_addr || !size || !cpu_has_apic)
return -EINVAL;
madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
@@ -751,6 +751,9 @@ static int __init acpi_parse_madt_ioapic_entries(void)
return -ENODEV;
}
+ if (!cpu_has_apic)
+ return -ENODEV;
+
/*
* if "noapic" boot option, don't look for IO-APICs
*/
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 6273bf74c20..254cee9f0b7 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -62,6 +62,18 @@ int apic_verbosity;
static void apic_pm_activate(void);
+int modern_apic(void)
+{
+ unsigned int lvr, version;
+ /* AMD systems use old APIC versions, so check the CPU */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 >= 0xf)
+ return 1;
+ lvr = apic_read(APIC_LVR);
+ version = GET_APIC_VERSION(lvr);
+ return version >= 0x14;
+}
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -119,10 +131,7 @@ void enable_NMI_through_LVT0 (void * dummy)
int get_physical_broadcast(void)
{
- unsigned int lvr, version;
- lvr = apic_read(APIC_LVR);
- version = GET_APIC_VERSION(lvr);
- if (!APIC_INTEGRATED(version) || version >= 0x14)
+ if (modern_apic())
return 0xff;
else
return 0xf;
@@ -349,9 +358,9 @@ int __init verify_local_APIC(void)
void __init sync_Arb_IDs(void)
{
- /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
- unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (ver >= 0x14) /* P4 or higher */
+ /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1
+ And not needed on AMD */
+ if (modern_apic())
return;
/*
* Wait for idle.
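
An illustrative, user-space sketch (not part of the patch) of the test the new modern_apic() helper performs: AMD family 0xf and later are trusted regardless of the reported APIC version, everything else needs a local APIC version of 0x14 or newer. The LVR value and CPU identification are passed in as parameters instead of being read from hardware, and the sample values in main() are made up.

#include <stdio.h>

/* the APIC version lives in the low byte of the LVR register */
#define GET_APIC_VERSION(lvr)   ((lvr) & 0xFF)

static int is_modern_apic(int vendor_is_amd, int cpu_family, unsigned int lvr)
{
        /* AMD K8 and later report old APIC version numbers, so trust the CPU family */
        if (vendor_is_amd && cpu_family >= 0xf)
                return 1;
        /* otherwise require an APIC version of 0x14 or newer */
        return GET_APIC_VERSION(lvr) >= 0x14;
}

int main(void)
{
        printf("%d\n", is_modern_apic(0, 6, 0x00050014));  /* Intel, version 0x14 -> 1 */
        printf("%d\n", is_modern_apic(1, 0xf, 0x00050010)); /* AMD K8, old version -> 1 */
        printf("%d\n", is_modern_apic(0, 6, 0x00030010));   /* old integrated APIC -> 0 */
        return 0;
}
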
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index 712a26bd445..7c0e160a214 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -46,7 +46,7 @@
#define PFX "powernow-k8: "
#define BFX PFX "BIOS error: "
-#define VERSION "version 1.60.1"
+#define VERSION "version 1.60.2"
#include "powernow-k8.h"
/* serialize freq changes */
@@ -55,7 +55,7 @@ static DEFINE_MUTEX(fidvid_mutex);
static struct powernow_k8_data *powernow_data[NR_CPUS];
#ifndef CONFIG_SMP
-static cpumask_t cpu_core_map[1] = { CPU_MASK_ALL };
+static cpumask_t cpu_core_map[1];
#endif
/* Return a frequency in MHz, given an input fid */
@@ -910,6 +910,9 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
unsigned int newstate;
int ret = -EIO;
+ if (!data)
+ return -EINVAL;
+
/* only run on specific CPU from here on */
oldmask = current->cpus_allowed;
set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
@@ -969,6 +972,9 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
{
struct powernow_k8_data *data = powernow_data[pol->cpu];
+ if (!data)
+ return -EINVAL;
+
return cpufreq_frequency_table_verify(pol, data->powernow_table);
}
@@ -977,7 +983,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
{
struct powernow_k8_data *data;
cpumask_t oldmask = CPU_MASK_ALL;
- int rc, i;
+ int rc;
if (!cpu_online(pol->cpu))
return -ENODEV;
@@ -1063,8 +1069,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
printk("cpu_init done, current fid 0x%x, vid 0x%x\n",
data->currfid, data->currvid);
- for_each_cpu_mask(i, cpu_core_map[pol->cpu])
- powernow_data[i] = data;
+ powernow_data[pol->cpu] = data;
return 0;
@@ -1104,6 +1109,9 @@ static unsigned int powernowk8_get (unsigned int cpu)
if (!data)
return -EINVAL;
+ if (!data)
+ return -EINVAL;
+
set_cpus_allowed(current, cpumask_of_cpu(cpu));
if (smp_processor_id() != cpu) {
printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu);
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 8d8aa9d1796..db120174aa7 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -110,21 +110,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static int mpc_record;
static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
-#ifdef CONFIG_X86_NUMAQ
-static int MP_valid_apicid(int apicid, int version)
-{
- return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
-}
-#else
-static int MP_valid_apicid(int apicid, int version)
-{
- if (version >= 0x14)
- return apicid < 0xff;
- else
- return apicid < 0xf;
-}
-#endif
-
static void __devinit MP_processor_info (struct mpc_config_processor *m)
{
int ver, apicid;
@@ -190,12 +175,6 @@ static void __devinit MP_processor_info (struct mpc_config_processor *m)
ver = m->mpc_apicver;
- if (!MP_valid_apicid(apicid, ver)) {
- printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n",
- m->mpc_apicid, MAX_APICS);
- return;
- }
-
/*
* Validate version
*/
diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c
index 10e21a4773d..99aab41a05b 100644
--- a/arch/i386/kernel/reboot_fixups.c
+++ b/arch/i386/kernel/reboot_fixups.c
@@ -51,7 +51,5 @@ void mach_reboot_fixups(void)
cur->reboot_fixup(dev);
}
-
- printk(KERN_WARNING "No reboot fixup found for your hardware\n");
}
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index eacc3f0a2ea..80cb3b2d099 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -963,6 +963,36 @@ efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
return 0;
}
+ /*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init
+e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /* if start is now at or beyond end, we're done, full
+ * coverage */
+ if (start >= end)
+ return 1; /* we're done */
+ }
+ return 0;
+}
+
/*
* Find the highest page frame number we have available
*/
@@ -1317,8 +1347,8 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat
/*
* Request address space for all standard resources
*
- * This is called just before pcibios_assign_resources(), which is also
- * an fs_initcall, but is linked in later (in arch/i386/pci/i386.c).
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
*/
static int __init request_standard_resources(void)
{
@@ -1339,7 +1369,7 @@ static int __init request_standard_resources(void)
return 0;
}
-fs_initcall(request_standard_resources);
+subsys_initcall(request_standard_resources);
static void __init register_memory(void)
{
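
An illustrative, standalone sketch (not part of the patch) of the full-coverage walk the new e820_all_mapped() performs, run against a tiny made-up memory map. The region struct and the sample addresses are assumptions; the real function walks the kernel's sorted, non-overlapping e820 table, which is what makes the single forward pass sufficient.

#include <stdio.h>

struct region { unsigned long addr, size; unsigned type; };

#define E820_RESERVED 2

static struct region map[] = {          /* must be sorted and non-overlapping */
        { 0x000f0000UL, 0x00010000UL, E820_RESERVED },
        { 0xe0000000UL, 0x10000000UL, E820_RESERVED },  /* pretend MCFG aperture */
};

static int all_mapped(unsigned long start, unsigned long end, unsigned type)
{
        unsigned int i;
        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                struct region *r = &map[i];
                if (type && r->type != type)
                        continue;
                /* no overlap with [start,end) at all? */
                if (r->addr >= end || r->addr + r->size <= start)
                        continue;
                /* region covers the current start, so coverage extends to its end */
                if (r->addr <= start)
                        start = r->addr + r->size;
                if (start >= end)
                        return 1;       /* whole range covered */
        }
        return 0;
}

int main(void)
{
        unsigned long base = 0xe0000000UL, size = 256UL * 1024 * 1024;
        printf("%d\n", all_mapped(base, base + size, E820_RESERVED));   /* 1: fully reserved */
        printf("%d\n", all_mapped(base, 0xf8000000UL, E820_RESERVED));  /* 0: runs past the entry */
        return 0;
}
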
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 9f66ac582a8..ae6534ad816 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -651,6 +651,7 @@ void __init mem_init(void)
* Specifically, in the case of x86, we will always add
* memory to the highmem for now.
*/
+#ifdef CONFIG_HOTPLUG_MEMORY
#ifndef CONFIG_NEED_MULTIPLE_NODES
int add_memory(u64 start, u64 size)
{
@@ -667,6 +668,7 @@ int remove_memory(u64 start, u64 size)
return -EINVAL;
}
#endif
+#endif
kmem_cache_t *pgd_cache;
kmem_cache_t *pmd_cache;
diff --git a/arch/i386/pci/direct.c b/arch/i386/pci/direct.c
index 99012b93bd1..0659ced0118 100644
--- a/arch/i386/pci/direct.c
+++ b/arch/i386/pci/direct.c
@@ -4,6 +4,7 @@
#include <linux/pci.h>
#include <linux/init.h>
+#include <linux/dmi.h>
#include "pci.h"
/*
@@ -18,8 +19,10 @@ int pci_conf1_read(unsigned int seg, unsigned int bus,
{
unsigned long flags;
- if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
+ if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) {
+ *value = -1;
return -EINVAL;
+ }
spin_lock_irqsave(&pci_config_lock, flags);
@@ -188,6 +191,10 @@ static int __init pci_sanity_check(struct pci_raw_ops *o)
if (pci_probe & PCI_NO_CHECKS)
return 1;
+ /* Assume Type 1 works for newer systems.
+ This handles machines that don't have anything on PCI Bus 0. */
+ if (dmi_get_year(DMI_BIOS_DATE) >= 2001)
+ return 1;
for (devfn = 0; devfn < 0x100; devfn++) {
if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x))
diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c
index 613789071f3..f77d7f8b9bf 100644
--- a/arch/i386/pci/mmconfig.c
+++ b/arch/i386/pci/mmconfig.c
@@ -12,14 +12,20 @@
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/acpi.h>
+#include <asm/e820.h>
#include "pci.h"
+#define MMCONFIG_APER_SIZE (256*1024*1024)
+
+/* Assume systems with more busses have correct MCFG */
+#define MAX_CHECK_BUS 16
+
#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
/* The base address of the last MMCONFIG device accessed */
static u32 mmcfg_last_accessed_device;
-static DECLARE_BITMAP(fallback_slots, 32);
+static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32);
/*
* Functions for accessing PCI configuration space with MMCONFIG accesses
@@ -29,8 +35,8 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
int cfg_num = -1;
struct acpi_table_mcfg_config *cfg;
- if (seg == 0 && bus == 0 &&
- test_bit(PCI_SLOT(devfn), fallback_slots))
+ if (seg == 0 && bus < MAX_CHECK_BUS &&
+ test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots))
return 0;
while (1) {
@@ -74,8 +80,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
unsigned long flags;
u32 base;
- if (!value || (bus > 255) || (devfn > 255) || (reg > 4095))
+ if (!value || (bus > 255) || (devfn > 255) || (reg > 4095)) {
+ *value = -1;
return -EINVAL;
+ }
base = get_base_addr(seg, bus, devfn);
if (!base)
@@ -146,29 +154,34 @@ static struct pci_raw_ops pci_mmcfg = {
Normally this can be expressed in the MCFG by not listing them
and assigning suitable _SEGs, but this isn't implemented in some BIOS.
Instead try to discover all devices on bus 0 that are unreachable using MM
- and fallback for them.
- We only do this for bus 0/seg 0 */
+ and fallback for them. */
static __init void unreachable_devices(void)
{
- int i;
+ int i, k;
unsigned long flags;
- for (i = 0; i < 32; i++) {
- u32 val1;
- u32 addr;
-
- pci_conf1_read(0, 0, PCI_DEVFN(i, 0), 0, 4, &val1);
- if (val1 == 0xffffffff)
- continue;
-
- /* Locking probably not needed, but safer */
- spin_lock_irqsave(&pci_config_lock, flags);
- addr = get_base_addr(0, 0, PCI_DEVFN(i, 0));
- if (addr != 0)
- pci_exp_set_dev_base(addr, 0, PCI_DEVFN(i, 0));
- if (addr == 0 || readl((u32 __iomem *)mmcfg_virt_addr) != val1)
- set_bit(i, fallback_slots);
- spin_unlock_irqrestore(&pci_config_lock, flags);
+ for (k = 0; k < MAX_CHECK_BUS; k++) {
+ for (i = 0; i < 32; i++) {
+ u32 val1;
+ u32 addr;
+
+ pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1);
+ if (val1 == 0xffffffff)
+ continue;
+
+ /* Locking probably not needed, but safer */
+ spin_lock_irqsave(&pci_config_lock, flags);
+ addr = get_base_addr(0, k, PCI_DEVFN(i, 0));
+ if (addr != 0)
+ pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0));
+ if (addr == 0 ||
+ readl((u32 __iomem *)mmcfg_virt_addr) != val1) {
+ set_bit(i + 32*k, fallback_slots);
+ printk(KERN_NOTICE
+ "PCI: No mmconfig possible on %x:%x\n", k, i);
+ }
+ spin_unlock_irqrestore(&pci_config_lock, flags);
+ }
}
}
@@ -183,6 +196,14 @@ void __init pci_mmcfg_init(void)
(pci_mmcfg_config[0].base_address == 0))
return;
+ if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
+ pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE,
+ E820_RESERVED)) {
+ printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n");
+ printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+ return;
+ }
+
printk(KERN_INFO "PCI: Using MMCONFIG\n");
raw_pci_ops = &pci_mmcfg;
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
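
An illustrative, standalone sketch (not part of the patch) of the per-bus fallback bookkeeping used above: bit 32*bus + slot in the bitmap marks a device below MAX_CHECK_BUS that has to fall back to Type 1 configuration accesses. The byte-array helpers below are stand-ins for the kernel's set_bit()/test_bit().

#include <stdio.h>
#include <string.h>

#define MAX_CHECK_BUS 16
#define BITS_NEEDED   (MAX_CHECK_BUS * 32)     /* 32 slots per checked bus */

static unsigned char fallback[BITS_NEEDED / 8];

static void mark_fallback(int bus, int slot)
{
        int bit = 32 * bus + slot;
        fallback[bit / 8] |= 1u << (bit % 8);
}

static int needs_fallback(int bus, int slot)
{
        int bit = 32 * bus + slot;
        return (fallback[bit / 8] >> (bit % 8)) & 1;
}

int main(void)
{
        memset(fallback, 0, sizeof(fallback));
        mark_fallback(2, 5);    /* device 02:05.* cannot be reached via MMCONFIG */
        printf("%d %d\n", needs_fallback(2, 5), needs_fallback(0, 5));  /* 1 0 */
        return 0;
}
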
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index d1e2fc56648..648047a0bce 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -194,6 +194,9 @@ endchoice
endmenu
+config ARCH_SELECT_MEMORY_MODEL
+ def_bool y
+
config ARCH_SPARSEMEM_ENABLE
def_bool y
diff --git a/arch/sparc64/defconfig b/arch/sparc64/defconfig
index 30389085a35..1317380fa93 100644
--- a/arch/sparc64/defconfig
+++ b/arch/sparc64/defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.16
-# Fri Mar 31 01:40:57 2006
+# Sun Apr 2 19:31:04 2006
#
CONFIG_SPARC=y
CONFIG_SPARC64=y
@@ -838,7 +838,6 @@ CONFIG_FB_TILEBLITTING=y
# CONFIG_FB_NVIDIA is not set
# CONFIG_FB_RIVA is not set
# CONFIG_FB_MATROX is not set
-# CONFIG_FB_RADEON_OLD is not set
CONFIG_FB_RADEON=y
CONFIG_FB_RADEON_I2C=y
# CONFIG_FB_RADEON_DEBUG is not set
@@ -924,6 +923,7 @@ CONFIG_SND_MTPAV=m
# PCI devices
#
# CONFIG_SND_AD1889 is not set
+# CONFIG_SND_ALS300 is not set
CONFIG_SND_ALI5451=m
# CONFIG_SND_ATIIXP is not set
# CONFIG_SND_ATIIXP_MODEM is not set
@@ -955,6 +955,7 @@ CONFIG_SND_ALI5451=m
# CONFIG_SND_MIXART is not set
# CONFIG_SND_NM256 is not set
# CONFIG_SND_PCXHR is not set
+# CONFIG_SND_RIPTIDE is not set
# CONFIG_SND_RME32 is not set
# CONFIG_SND_RME96 is not set
# CONFIG_SND_RME9652 is not set
@@ -1109,6 +1110,11 @@ CONFIG_USB_HIDDEV=y
# CONFIG_MMC is not set
#
+# LED devices
+#
+# CONFIG_NEW_LEDS is not set
+
+#
# InfiniBand support
#
# CONFIG_INFINIBAND is not set
diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c
index eb93e9c5284..49e6dedd027 100644
--- a/arch/sparc64/kernel/ptrace.c
+++ b/arch/sparc64/kernel/ptrace.c
@@ -244,6 +244,13 @@ asmlinkage void do_ptrace(struct pt_regs *regs)
}
switch(request) {
+ case PTRACE_PEEKUSR:
+ if (addr != 0)
+ pt_error_return(regs, EIO);
+ else
+ pt_succ_return(regs, 0);
+ goto out_tsk;
+
case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA: {
unsigned long tmp64;
@@ -602,6 +609,22 @@ asmlinkage void do_ptrace(struct pt_regs *regs)
/* PTRACE_DUMPCORE unsupported... */
+ case PTRACE_GETEVENTMSG: {
+ int err;
+
+ if (test_thread_flag(TIF_32BIT))
+ err = put_user(child->ptrace_message,
+ (unsigned int __user *) data);
+ else
+ err = put_user(child->ptrace_message,
+ (unsigned long __user *) data);
+ if (err)
+ pt_error_return(regs, -err);
+ else
+ pt_succ_return(regs, 0);
+ break;
+ }
+
default: {
int err = ptrace_request(child, request, addr, data);
if (err)
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 8175a6968c6..eb36f7988ff 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -745,12 +745,21 @@ struct call_data_struct {
int wait;
};
-static DEFINE_SPINLOCK(call_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
static struct call_data_struct *call_data;
extern unsigned long xcall_call_function;
-/*
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: currently unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute <<func>> or are or have executed.
+ *
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
@@ -759,7 +768,6 @@ static int smp_call_function_mask(void (*func)(void *info), void *info,
{
struct call_data_struct data;
int cpus;
- long timeout;
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
@@ -777,31 +785,18 @@ static int smp_call_function_mask(void (*func)(void *info), void *info,
goto out_unlock;
call_data = &data;
+ mb();
smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask);
- /*
- * Wait for other cpus to complete function or at
- * least snap the call data.
- */
- timeout = 1000000;
- while (atomic_read(&data.finished) != cpus) {
- if (--timeout <= 0)
- goto out_timeout;
- barrier();
- udelay(1);
- }
+ /* Wait for response */
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
out_unlock:
spin_unlock(&call_lock);
return 0;
-
-out_timeout:
- spin_unlock(&call_lock);
- printk("XCALL: Remote cpus not responding, ncpus=%d finished=%d\n",
- cpus, atomic_read(&data.finished));
- return 0;
}
int smp_call_function(void (*func)(void *info), void *info,
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index ff090bb9734..2793a5d8238 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -1130,9 +1130,9 @@ static void cheetah_log_errors(struct pt_regs *regs, struct cheetah_err_info *in
(recoverable ? KERN_WARNING : KERN_CRIT), smp_processor_id(),
afsr, afar,
(afsr & CHAFSR_TL1) ? 1 : 0);
- printk("%s" "ERROR(%d): TPC[%016lx] TNPC[%016lx] TSTATE[%016lx]\n",
+ printk("%s" "ERROR(%d): TPC[%lx] TNPC[%lx] O7[%lx] TSTATE[%lx]\n",
(recoverable ? KERN_WARNING : KERN_CRIT), smp_processor_id(),
- regs->tpc, regs->tnpc, regs->tstate);
+ regs->tpc, regs->tnpc, regs->u_regs[UREG_I7], regs->tstate);
printk("%s" "ERROR(%d): M_SYND(%lx), E_SYND(%lx)%s%s\n",
(recoverable ? KERN_WARNING : KERN_CRIT), smp_processor_id(),
(afsr & CHAFSR_M_SYNDROME) >> CHAFSR_M_SYNDROME_SHIFT,
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 4310b4a311a..7df2fe1844b 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -136,6 +136,11 @@ config X86_L1_CACHE_SHIFT
default "7" if GENERIC_CPU || MPSC
default "6" if MK8
+config X86_INTERNODE_CACHE_BYTES
+ int
+ default "4096" if X86_VSMP
+ default X86_L1_CACHE_BYTES if !X86_VSMP
+
config X86_TSC
bool
default y
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 585fd4a559c..e573e2ab551 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -24,37 +24,37 @@
LDFLAGS := -m elf_x86_64
OBJCOPYFLAGS := -O binary -R .note -R .comment -S
LDFLAGS_vmlinux :=
-
CHECKFLAGS += -D__x86_64__ -m64
+cflags-y :=
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
-CFLAGS += $(cflags-y)
-CFLAGS += -m64
-CFLAGS += -mno-red-zone
-CFLAGS += -mcmodel=kernel
-CFLAGS += -pipe
+cflags-y += -m64
+cflags-y += -mno-red-zone
+cflags-y += -mcmodel=kernel
+cflags-y += -pipe
cflags-$(CONFIG_REORDER) += -ffunction-sections
# this makes reading assembly source easier, but produces worse code
# actually it makes the kernel smaller too.
-CFLAGS += -fno-reorder-blocks
-CFLAGS += -Wno-sign-compare
+cflags-y += -fno-reorder-blocks
+cflags-y += -Wno-sign-compare
ifneq ($(CONFIG_UNWIND_INFO),y)
-CFLAGS += -fno-asynchronous-unwind-tables
+cflags-y += -fno-asynchronous-unwind-tables
endif
ifneq ($(CONFIG_DEBUG_INFO),y)
# -fweb shrinks the kernel a bit, but the difference is very small
# it also messes up debugging, so don't use it for now.
-#CFLAGS += $(call cc-option,-fweb)
+#cflags-y += $(call cc-option,-fweb)
endif
# -funit-at-a-time shrinks the kernel .text considerably
# unfortunately it makes reading oopses harder.
-CFLAGS += $(call cc-option,-funit-at-a-time)
+cflags-y += $(call cc-option,-funit-at-a-time)
# prevent gcc from generating any FP code by mistake
-CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+CFLAGS += $(cflags-y)
AFLAGS += -m64
head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 566ecc97ee5..3c45ec22b3f 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16-git9
-# Sat Mar 25 15:18:40 2006
+# Linux kernel version: 2.6.17-rc1
+# Mon Apr 3 16:11:14 2006
#
CONFIG_X86_64=y
CONFIG_64BIT=y
@@ -9,6 +9,7 @@ CONFIG_X86=y
CONFIG_SEMAPHORE_SLEEPERS=y
CONFIG_MMU=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_X86_CMPXCHG=y
CONFIG_EARLY_PRINTK=y
@@ -55,10 +56,6 @@ CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_EPOLL=y
CONFIG_SHMEM=y
-CONFIG_CC_ALIGN_FUNCTIONS=0
-CONFIG_CC_ALIGN_LABELS=0
-CONFIG_CC_ALIGN_LOOPS=0
-CONFIG_CC_ALIGN_JUMPS=0
CONFIG_SLAB=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
@@ -70,7 +67,6 @@ CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
-CONFIG_OBSOLETE_MODPARM=y
# CONFIG_MODVERSIONS is not set
# CONFIG_MODULE_SRCVERSION_ALL is not set
# CONFIG_KMOD is not set
@@ -81,6 +77,7 @@ CONFIG_STOP_MACHINE=y
#
CONFIG_LBD=y
# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_LSF is not set
#
# IO Schedulers
@@ -105,6 +102,7 @@ CONFIG_X86_PC=y
CONFIG_GENERIC_CPU=y
CONFIG_X86_L1_CACHE_BYTES=128
CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_INTERNODE_CACHE_BYTES=128
CONFIG_X86_TSC=y
CONFIG_X86_GOOD_APIC=y
# CONFIG_MICROCODE is not set
@@ -116,6 +114,7 @@ CONFIG_X86_LOCAL_APIC=y
CONFIG_MTRR=y
CONFIG_SMP=y
CONFIG_SCHED_SMT=y
+CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
@@ -138,6 +137,7 @@ CONFIG_NEED_MULTIPLE_NODES=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
+CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
CONFIG_NR_CPUS=32
CONFIG_HOTPLUG_CPU=y
CONFIG_HPET_TIMER=y
@@ -289,6 +289,7 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_AH is not set
# CONFIG_INET_ESP is not set
# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_XFRM_TUNNEL is not set
# CONFIG_INET_TUNNEL is not set
CONFIG_INET_DIAG=y
CONFIG_INET_TCP_DIAG=y
@@ -300,6 +301,7 @@ CONFIG_IPV6=y
# CONFIG_INET6_AH is not set
# CONFIG_INET6_ESP is not set
# CONFIG_INET6_IPCOMP is not set
+# CONFIG_INET6_XFRM_TUNNEL is not set
# CONFIG_INET6_TUNNEL is not set
# CONFIG_IPV6_TUNNEL is not set
# CONFIG_NETFILTER is not set
@@ -704,7 +706,6 @@ CONFIG_S2IO=m
# Wireless LAN (non-hamradio)
#
# CONFIG_NET_RADIO is not set
-# CONFIG_NET_WIRELESS_RTNETLINK is not set
#
# Wan interfaces
@@ -791,7 +792,7 @@ CONFIG_HW_CONSOLE=y
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
-# CONFIG_SERIAL_8250_ACPI is not set
+CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_NR_UARTS=4
CONFIG_SERIAL_8250_RUNTIME_UARTS=4
# CONFIG_SERIAL_8250_EXTENDED is not set
@@ -921,6 +922,7 @@ CONFIG_HWMON=y
# Digital Video Broadcasting Devices
#
# CONFIG_DVB is not set
+# CONFIG_USB_DABUSB is not set
#
# Graphics support
@@ -932,6 +934,8 @@ CONFIG_VIDEO_SELECT=y
# Console display driver support
#
CONFIG_VGA_CONSOLE=y
+CONFIG_VGACON_SOFT_SCROLLBACK=y
+CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256
CONFIG_DUMMY_CONSOLE=y
#
@@ -1058,15 +1062,6 @@ CONFIG_USB_HIDINPUT=y
# CONFIG_USB_MICROTEK is not set
#
-# USB Multimedia devices
-#
-# CONFIG_USB_DABUSB is not set
-
-#
-# Video4Linux support is needed for USB Multimedia device support
-#
-
-#
# USB Network Adapters
#
# CONFIG_USB_CATC is not set
@@ -1118,9 +1113,15 @@ CONFIG_USB_MON=y
# CONFIG_MMC is not set
#
+# LED devices
+#
+# CONFIG_NEW_LEDS is not set
+
+#
# InfiniBand support
#
# CONFIG_INFINIBAND is not set
+# CONFIG_IPATH_CORE is not set
#
# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
@@ -1128,6 +1129,11 @@ CONFIG_USB_MON=y
# CONFIG_EDAC is not set
#
+# Real Time Clock
+#
+# CONFIG_RTC_CLASS is not set
+
+#
# Firmware Drivers
#
# CONFIG_EDD is not set
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 35b2faccdc6..5a980267668 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -15,6 +15,8 @@
#include <asm/vsyscall32.h>
#include <linux/linkage.h>
+#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
+
.macro IA32_ARG_FIXUP noebp=0
movl %edi,%r8d
.if \noebp
@@ -109,8 +111,8 @@ ENTRY(ia32_sysenter_target)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
sysenter_do_call:
- cmpl $(IA32_NR_syscalls),%eax
- jae ia32_badsys
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja ia32_badsys
IA32_ARG_FIXUP 1
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -210,8 +212,8 @@ ENTRY(ia32_cstar_target)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cstar_do_call:
- cmpl $IA32_NR_syscalls,%eax
- jae ia32_badsys
+ cmpl $IA32_NR_syscalls-1,%eax
+ ja ia32_badsys
IA32_ARG_FIXUP 1
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -296,8 +298,8 @@ ENTRY(ia32_syscall)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
jnz ia32_tracesys
ia32_do_syscall:
- cmpl $(IA32_NR_syscalls),%eax
- jae ia32_badsys
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja ia32_badsys
IA32_ARG_FIXUP
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
@@ -685,12 +687,11 @@ ia32_sys_call_table:
.quad sys_readlinkat /* 305 */
.quad sys_fchmodat
.quad sys_faccessat
- .quad sys_ni_syscall /* pselect6 for now */
- .quad sys_ni_syscall /* ppoll for now */
+ .quad quiet_ni_syscall /* pselect6 for now */
+ .quad quiet_ni_syscall /* ppoll for now */
.quad sys_unshare /* 310 */
.quad compat_sys_set_robust_list
.quad compat_sys_get_robust_list
+ .quad sys_splice
+ .quad sys_sync_file_range
ia32_syscall_end:
- .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
- .quad ni_syscall
- .endr
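
An illustrative C sketch (not part of the patch) of the bounds check done before indexing the ia32 syscall table, using a dummy table whose size is computed the same way IA32_NR_syscalls now is. For an unsigned syscall number, "nr > N-1" (cmpl $(N-1); ja) rejects exactly the same values as "nr >= N" (cmpl $N; jae).

#include <stdio.h>

typedef long (*syscall_fn)(void);

static long sys_a(void) { return 1; }
static long sys_b(void) { return 2; }

static syscall_fn table[] = { sys_a, sys_b };
/* computed from the table itself, like IA32_NR_syscalls above */
#define NR_SYSCALLS (sizeof(table) / sizeof(table[0]))

static long dispatch(unsigned int nr)
{
        if (nr > NR_SYSCALLS - 1)       /* same set of values as nr >= NR_SYSCALLS */
                return -38;             /* -ENOSYS */
        return table[nr]();
}

int main(void)
{
        printf("%ld %ld %ld\n", dispatch(0), dispatch(1), dispatch(2)); /* 1 2 -38 */
        return 0;
}
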
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index fffd6b0a2fa..70b9d21ed67 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -80,7 +80,7 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
return 0;
}
- if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+ if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
return 0;
}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 293cd71a266..62776c07cff 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -80,7 +80,12 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
return 0;
}
-int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int __meminit
+e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
@@ -94,6 +99,35 @@ int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
return 0;
}
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /* if start is now at or beyond end, we're done, full coverage */
+ if (start >= end)
+ return 1; /* we're done */
+ }
+ return 0;
+}
+
/*
* Find a free area in a specific range.
*/
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 8538bfea30e..c946e4fe67a 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -180,6 +180,10 @@ rff_trace:
*
* XXX if we had a free scratch register we could save the RSP into the stack frame
* and report it properly in ps. Unfortunately we haven't.
+ *
+ * When user can change the frames always force IRET. That is because
+ * it deals with uncanonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
*/
ENTRY(system_call)
@@ -254,7 +258,10 @@ sysret_signal:
xorl %esi,%esi # oldset -> arg2
call ptregscall_common
1: movl $_TIF_NEED_RESCHED,%edi
- jmp sysret_check
+ /* Use IRET because user could have changed frame. This
+ works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
+ cli
+ jmp int_with_check
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -280,7 +287,8 @@ tracesys:
call syscall_trace_leave
RESTORE_TOP_OF_STACK %rbx
RESTORE_REST
- jmp ret_from_sys_call
+ /* Use IRET because user could have changed frame */
+ jmp int_ret_from_sys_call
CFI_ENDPROC
/*
@@ -408,25 +416,9 @@ ENTRY(stub_execve)
CFI_ADJUST_CFA_OFFSET -8
CFI_REGISTER rip, r11
SAVE_REST
- movq %r11, %r15
- CFI_REGISTER rip, r15
FIXUP_TOP_OF_STACK %r11
call sys_execve
- GET_THREAD_INFO(%rcx)
- bt $TIF_IA32,threadinfo_flags(%rcx)
- CFI_REMEMBER_STATE
- jc exec_32bit
RESTORE_TOP_OF_STACK %r11
- movq %r15, %r11
- CFI_REGISTER rip, r11
- RESTORE_REST
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip, 0
- ret
-
-exec_32bit:
- CFI_RESTORE_STATE
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 10b3e348fc9..6f0790e8b6d 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -29,6 +29,8 @@
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6
+atomic_t mce_entry;
+
static int mce_dont_init;
/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
@@ -172,10 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
int i;
int panicm_found = 0;
+ atomic_inc(&mce_entry);
+
if (regs)
notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
if (!banks)
- return;
+ goto out2;
memset(&m, 0, sizeof(struct mce));
m.cpu = safe_smp_processor_id();
@@ -266,6 +270,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
out:
/* Last thing done in the machine check exception to clear state. */
wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ out2:
+ atomic_dec(&mce_entry);
}
/*
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index d9e4067faf0..4e6357fe0ec 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -34,6 +34,7 @@
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm/local.h>
+#include <asm/mce.h>
/*
* lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -480,6 +481,12 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
__get_cpu_var(nmi_touch) = 0;
touched = 1;
}
+#ifdef CONFIG_X86_MCE
+ /* Could check oops_in_progress here too, but it's safer
+ not too */
+ if (atomic_read(&mce_entry) > 0)
+ touched = 1;
+#endif
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 03c9eeedb0f..af035ede70c 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -48,9 +48,11 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
struct page *page;
int node;
+#ifdef CONFIG_PCI
if (dev->bus == &pci_bus_type)
node = pcibus_to_node(to_pci_dev(dev)->bus);
else
+#endif
node = numa_node_id();
page = alloc_pages_node(node, gfp, order);
return page ? page_address(page) : NULL;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 70dd8e5c688..1c44b53cb15 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -781,10 +781,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
}
case ARCH_GET_GS: {
unsigned long base;
+ unsigned gsindex;
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
- else if (doit)
- rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else if (doit) {
+ asm("movl %%gs,%0" : "=r" (gsindex));
+ if (gsindex)
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ base = task->thread.gs;
+ }
else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 0856ad444f9..c50b06765a8 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -353,8 +353,10 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (fullarg(from, "enable_timer_pin_1"))
disable_timer_pin_1 = -1;
- if (fullarg(from, "nolapic") || fullarg(from, "disableapic"))
+ if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
disable_apic = 1;
+ }
if (fullarg(from, "noapic"))
skip_ioapic_setup = 1;
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index ef8bc46dc14..7392570f975 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -726,7 +726,7 @@ static __init int late_hpet_init(void)
unsigned int ntimer;
if (!vxtime.hpet_address)
- return -1;
+ return 0;
memset(&hd, 0, sizeof (hd));
@@ -917,6 +917,8 @@ void __init time_init(void)
vxtime.hpet_address = 0;
if (hpet_use_timer) {
+ /* set tick_nsec to use the proper rate for HPET */
+ tick_nsec = TICK_NSEC_HPET;
cpu_khz = hpet_calibrate_tsc();
timename = "HPET";
#ifdef CONFIG_X86_PM_TIMER
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 39ff0708f80..b81f473c4a1 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -65,7 +65,7 @@ SECTIONS
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
*(.data.cacheline_aligned)
}
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
*(.data.read_mostly)
}
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index d78f46056bd..fec4e521c01 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -112,7 +112,6 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback);
#undef memcpy
#undef memset
#undef memmove
-#undef strlen
extern void * memset(void *,int,__kernel_size_t);
extern size_t strlen(const char *);
@@ -121,7 +120,6 @@ extern void * memcpy(void *,const void *,__kernel_size_t);
extern void * __memcpy(void *,const void *,__kernel_size_t);
EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(strlen);
EXPORT_SYMBOL(strpbrk);
EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(memcpy);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e5f7f1c3446..4ba34e95d83 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned
if (paddr >= end)
break;
- if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
+ if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
set_pud(pud, __pud(0));
continue;
}
@@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
/*
* Memory hotplug specific functions
- * These are only for non-NUMA machines right now.
*/
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
void online_page(struct page *page)
{
@@ -520,6 +519,38 @@ void online_page(struct page *page)
num_physpages++;
}
+#ifndef CONFIG_MEMORY_HOTPLUG
+/*
+ * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+ * just online the pages.
+ */
+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+{
+ int err = -EIO;
+ unsigned long pfn;
+ unsigned long total = 0, mem = 0;
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+ if (pfn_valid(pfn)) {
+ online_page(pfn_to_page(pfn));
+ err = 0;
+ mem++;
+ }
+ total++;
+ }
+ if (!err) {
+ z->spanned_pages += total;
+ z->present_pages += mem;
+ z->zone_pgdat->node_spanned_pages += total;
+ z->zone_pgdat->node_present_pages += mem;
+ }
+ return err;
+}
+#endif
+
+/*
+ * Memory is added always to NORMAL zone. This means you will never get
+ * additional DMA/DMA32 memory.
+ */
int add_memory(u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(0);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4be82d6e2b4..cc02573a327 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn)
}
#endif
+static void * __init
+early_node_mem(int nodeid, unsigned long start, unsigned long end,
+ unsigned long size)
+{
+ unsigned long mem = find_e820_area(start, end, size);
+ void *ptr;
+ if (mem != -1L)
+ return __va(mem);
+ ptr = __alloc_bootmem_nopanic(size,
+ SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
+ if (ptr == 0) {
+ printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+ size, nodeid);
+ return NULL;
+ }
+ return ptr;
+}
+
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
unsigned long nodedata_phys;
+ void *bootmap;
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
start = round_up(start, ZONE_ALIGN);
@@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
- nodedata_phys = find_e820_area(start, end, pgdat_size);
- if (nodedata_phys == -1L)
- panic("Cannot find memory pgdat in node %d\n", nodeid);
-
- Dprintk("nodedata_phys %lx\n", nodedata_phys);
+ node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
+ if (node_data[nodeid] == NULL)
+ return;
+ nodedata_phys = __pa(node_data[nodeid]);
- node_data[nodeid] = phys_to_virt(nodedata_phys);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
@@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
/* Find a place for the bootmem map */
bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
- bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
- if (bootmap_start == -1L)
- panic("Not enough continuous space for bootmap on node %d", nodeid);
+ bootmap = early_node_mem(nodeid, bootmap_start, end,
+ bootmap_pages<<PAGE_SHIFT);
+ if (bootmap == NULL) {
+ if (nodedata_phys < start || nodedata_phys >= end)
+ free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
+ node_data[nodeid] = NULL;
+ return;
+ }
+ bootmap_start = __pa(bootmap);
Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
@@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+#ifdef CONFIG_ACPI_NUMA
+ srat_reserve_add_area(nodeid);
+#endif
node_set_online(nodeid);
}
@@ -335,6 +361,8 @@ __init int numa_setup(char *opt)
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt,"noacpi",6))
acpi_numa = -1;
+ if (!strncmp(opt,"hotadd=", 7))
+ hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
return 1;
}
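
An illustrative, standalone sketch (not part of the patch) of the two-step allocation pattern in the new early_node_mem(): prefer memory inside the node's own range, fall back to a generic boot-time allocator, and return NULL instead of panicking so the caller can simply skip the node. The two allocator functions below are dummies standing in for find_e820_area() and __alloc_bootmem_nopanic().

#include <stdio.h>
#include <stdlib.h>

static void *alloc_in_node_range(unsigned long start, unsigned long end, unsigned long size)
{
        (void)start; (void)end; (void)size;
        return NULL;                    /* pretend the node-local search found nothing */
}

static void *alloc_anywhere(unsigned long size)
{
        return malloc(size);            /* generic fallback allocator */
}

static void *early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size)
{
        void *ptr = alloc_in_node_range(start, end, size);
        if (ptr)
                return ptr;             /* best case: memory inside the node */
        ptr = alloc_anywhere(size);
        if (!ptr)
                fprintf(stderr, "Cannot find %lu bytes in node %d\n", size, nodeid);
        return ptr;                     /* NULL lets the caller skip the node */
}

int main(void)
{
        void *p = early_node_mem(1, 0x100000, 0x4000000, 4096);
        printf("%s\n", p ? "allocated via fallback" : "node skipped");
        free(p);
        return 0;
}
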
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc..15ae9fcd65a 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,26 @@
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
+#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
+ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
+ && !defined(CONFIG_MEMORY_HOTPLUG)
+#define RESERVE_HOTADD 1
+#endif
+
static struct acpi_table_slit *acpi_slit;
static nodemask_t nodes_parsed __initdata;
static nodemask_t nodes_found __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
+static int found_add_area __initdata;
+int hotadd_percent __initdata = 10;
static u8 pxm2node[256] = { [0 ... 255] = 0xff };
/* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
struct bootnode *nd = &nodes[i];
+
+ if (found_add_area)
+ return;
+
if (nd->start < start) {
nd->start = start;
if (nd->end < nd->start)
@@ -90,6 +105,8 @@ static __init void bad_srat(void)
acpi_numa = -1;
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
+ for (i = 0; i < MAX_NUMNODES; i++)
+ nodes_add[i].start = nodes_add[i].end = 0;
}
static __init inline int srat_disabled(void)
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
pxm, pa->apic_id, node);
}
+#ifdef RESERVE_HOTADD
+/*
+ * Protect against too large hotadd areas that would fill up memory.
+ */
+static int hotadd_enough_memory(struct bootnode *nd)
+{
+ static unsigned long allocated;
+ static unsigned long last_area_end;
+ unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
+ long mem = pages * sizeof(struct page);
+ unsigned long addr;
+ unsigned long allowed;
+ unsigned long oldpages = pages;
+
+ if (mem < 0)
+ return 0;
+ allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+ allowed = (allowed / 100) * hotadd_percent;
+ if (allocated + mem > allowed) {
+ /* Give them at least part of their hotadd memory upto hotadd_percent
+ It would be better to spread the limit out
+ over multiple hotplug areas, but that is too complicated
+ right now */
+ if (allocated >= allowed)
+ return 0;
+ pages = (allowed - allocated + mem) / sizeof(struct page);
+ mem = pages * sizeof(struct page);
+ nd->end = nd->start + pages*PAGE_SIZE;
+ }
+ /* Not completely fool proof, but a good sanity check */
+ addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
+ if (addr == -1UL)
+ return 0;
+ if (pages != oldpages)
+ printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
+ pages << PAGE_SHIFT);
+ last_area_end = addr + mem;
+ allocated += mem;
+ return 1;
+}
+
+/*
+ * It is fine to add this area to the nodes data it will be used later
+ * This code supports one contigious hot add area per node.
+ */
+static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+ unsigned long s_pfn = start >> PAGE_SHIFT;
+ unsigned long e_pfn = end >> PAGE_SHIFT;
+ int changed = 0;
+ struct bootnode *nd = &nodes_add[node];
+
+ /* I had some trouble with strange memory hotadd regions breaking
+ the boot. Be very strict here and reject anything unexpected.
+ If you want working memory hotadd write correct SRATs.
+
+ The node size check is a basic sanity check to guard against
+ mistakes */
+ if ((signed long)(end - start) < NODE_MIN_SIZE) {
+ printk(KERN_ERR "SRAT: Hotplug area too small\n");
+ return -1;
+ }
+
+ /* This check might be a bit too strict, but I'm keeping it for now. */
+ if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+ printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+ return -1;
+ }
+
+ if (!hotadd_enough_memory(&nodes_add[node])) {
+ printk(KERN_ERR "SRAT: Hotplug area too large\n");
+ return -1;
+ }
+
+ /* Looks good */
+
+ found_add_area = 1;
+ if (nd->start == nd->end) {
+ nd->start = start;
+ nd->end = end;
+ changed = 1;
+ } else {
+ if (nd->start == end) {
+ nd->start = start;
+ changed = 1;
+ }
+ if (nd->end == start) {
+ nd->end = end;
+ changed = 1;
+ }
+ if (!changed)
+ printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
+ }
+
+ if ((nd->end >> PAGE_SHIFT) > end_pfn)
+ end_pfn = nd->end >> PAGE_SHIFT;
+
+ if (changed)
+ printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
+ return 0;
+}
+#endif
+
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
- struct bootnode *nd;
+ struct bootnode *nd, oldnode;
unsigned long start, end;
int node, pxm;
int i;
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
}
if (ma->flags.enabled == 0)
return;
+ if (ma->flags.hot_pluggable && hotadd_percent == 0)
+ return;
start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
pxm = ma->proximity_domain;
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
bad_srat();
return;
}
- /* It is fine to add this area to the nodes data it will be used later*/
- if (ma->flags.hot_pluggable == 1)
- printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
- start, end);
i = conflicting_nodes(start, end);
if (i == node) {
printk(KERN_WARNING
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
return;
}
nd = &nodes[node];
+ oldnode = *nd;
if (!node_test_and_set(node, nodes_parsed)) {
nd->start = start;
nd->end = end;
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
if (nd->end < end)
nd->end = end;
}
+
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
+
+#ifdef RESERVE_HOTADD
+ if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+ /* Ignore hotadd region. Undo damage */
+ printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+ *nd = oldnode;
+ if ((nd->start | nd->end) == 0)
+ node_clear(node, nodes_parsed);
+ }
+#endif
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void)
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
pxmram -= e820_hole_size(s, e);
+ pxmram -= nodes_add[i].end - nodes_add[i].start;
+ if ((long)pxmram < 0)
+ pxmram = 0;
}
e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
- cutoff_node(i, start, end);
+ cutoff_node(i, start, end);
if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
unparse_node(i);
}
@@ -282,6 +415,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
/* Finally register nodes */
for_each_node_mask(i, nodes_parsed)
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ /* Try again in case setup_node_bootmem missed one due
+ to missing bootmem */
+ for_each_node_mask(i, nodes_parsed)
+ if (!node_online(i))
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+
for (i = 0; i < NR_CPUS; i++) {
if (cpu_to_node[i] == NUMA_NO_NODE)
continue;
@@ -303,6 +442,25 @@ static int node_to_pxm(int n)
return 0;
}
+void __init srat_reserve_add_area(int nodeid)
+{
+ if (found_add_area && nodes_add[nodeid].end) {
+ u64 total_mb;
+
+ printk(KERN_INFO "SRAT: Reserving hot-add memory space "
+ "for node %d at %Lx-%Lx\n",
+ nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
+ total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
+ >> PAGE_SHIFT;
+ total_mb *= sizeof(struct page);
+ total_mb >>= 20;
+ printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
+ "pre-allocated memory.\n", (unsigned long long)total_mb);
+ reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
+ nodes_add[nodeid].end - nodes_add[nodeid].start);
+ }
+}
+
int __node_distance(int a, int b)
{
int index;
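
An illustrative worked example (not part of the patch) of the budget arithmetic behind hotadd_enough_memory(): a hot-add area costs one struct page per page of the area, and that cost is capped at hotadd_percent of the memory present at boot. The concrete sizes used below (4K pages, a 64-byte struct page, 4GB of boot memory, a 64GB hot-add area) are assumptions for the example only.

#include <stdio.h>

#define PAGE_BYTES        4096ULL
#define PAGE_STRUCT_BYTES 64ULL         /* assumed sizeof(struct page) */

int main(void)
{
        unsigned long long boot_mem       = 4ULL << 30;   /* memory present at boot: 4GB */
        unsigned long long hotadd_percent = 10;           /* default in the patch */
        unsigned long long area           = 64ULL << 30;  /* hot-add area announced in the SRAT */

        /* budget: hotadd_percent of boot memory may be spent on mem_map */
        unsigned long long allowed = boot_mem / 100 * hotadd_percent;
        /* cost: one struct page per page of the hot-add area */
        unsigned long long cost = area / PAGE_BYTES * PAGE_STRUCT_BYTES;

        printf("mem_map cost %llu MB, budget %llu MB\n", cost >> 20, allowed >> 20);
        if (cost > allowed)     /* the kernel shrinks the area so its mem_map fits the budget */
                printf("area limited to about %llu MB\n",
                       (allowed / PAGE_STRUCT_BYTES * PAGE_BYTES) >> 20);
        return 0;
}
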
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index e616500207e..b493ed977e7 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -9,11 +9,16 @@
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/bitmap.h>
+#include <asm/e820.h>
+
#include "pci.h"
#define MMCONFIG_APER_SIZE (256*1024*1024)
+/* Verify the first 16 busses. We assume that systems with more busses
+ get MCFG right. */
+#define MAX_CHECK_BUS 16
-static DECLARE_BITMAP(fallback_slots, 32);
+static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
/* Static virtual mapping of the MMCONFIG aperture */
struct mmcfg_virt {
@@ -55,7 +60,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
{
char __iomem *addr;
- if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots))
+ if (seg == 0 && bus < MAX_CHECK_BUS &&
+ test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
return NULL;
addr = get_virt(seg, bus);
if (!addr)
@@ -69,8 +75,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
char __iomem *addr;
/* Why do we have this when nobody checks it. How about a BUG()!? -AK */
- if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
+ if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) {
+ *value = -1;
return -EINVAL;
+ }
addr = pci_dev_base(seg, bus, devfn);
if (!addr)
@@ -129,21 +137,26 @@ static struct pci_raw_ops pci_mmcfg = {
Normally this can be expressed in the MCFG by not listing them
and assigning suitable _SEGs, but this isn't implemented in some BIOS.
Instead try to discover all devices on bus 0 that are unreachable using MM
- and fallback for them.
- We only do this for bus 0/seg 0 */
+ and fallback for them. */
static __init void unreachable_devices(void)
{
- int i;
- for (i = 0; i < 32; i++) {
- u32 val1;
- char __iomem *addr;
-
- pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1);
- if (val1 == 0xffffffff)
- continue;
- addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
- if (addr == NULL|| readl(addr) != val1) {
- set_bit(i, fallback_slots);
+ int i, k;
+ /* Use the max bus number from ACPI here? */
+ for (k = 0; k < MAX_CHECK_BUS; k++) {
+ for (i = 0; i < 32; i++) {
+ u32 val1;
+ char __iomem *addr;
+
+ pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
+ if (val1 == 0xffffffff)
+ continue;
+ addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
+ if (addr == NULL|| readl(addr) != val1) {
+ set_bit(i + 32*k, fallback_slots);
+ printk(KERN_NOTICE
+ "PCI: No mmconfig possible on device %x:%x\n",
+ k, i);
+ }
}
}
}
@@ -161,6 +174,14 @@ void __init pci_mmcfg_init(void)
(pci_mmcfg_config[0].base_address == 0))
return;
+ if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
+ pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE,
+ E820_RESERVED)) {
+ printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n");
+ printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+ return;
+ }
+
/* RED-PEN i386 doesn't do _nocache right now */
pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
if (pci_mmcfg_virt == NULL) {