c22ce143d1
Use the x86 cache-bypassing copy instructions for copy_from_user(). Some performance data are Total of GLOBAL_POWER_EVENTS (CPU cycle samples) 2.6.12.4.orig 1921587 2.6.12.4.nt 1599424 1599424/1921587=83.23% (16.77% reduction) BSQ_CACHE_REFERENCE (L3 cache miss) 2.6.12.4.orig 57427 2.6.12.4.nt 20858 20858/57427=36.32% (63.7% reduction) L3 cache miss reduction of __copy_from_user_ll samples % 37408 65.1412 vmlinux __copy_from_user_ll 23 0.1103 vmlinux __copy_user_zeroing_intel_nocache 23/37408=0.061% (99.94% reduction) Top 5 of 2.6.12.4.nt Counted GLOBAL_POWER_EVENTS events (time during which processor is not stopped) with a unit mask of 0x01 (mandatory) count 100000 samples % app name symbol name 128392 8.0274 vmlinux __copy_user_zeroing_intel_nocache 64206 4.0143 vmlinux journal_add_journal_head 59746 3.7355 vmlinux do_get_write_access 47674 2.9807 vmlinux journal_put_journal_head 46021 2.8774 vmlinux journal_dirty_metadata pattern9-0-cpu4-0-09011728/summary.out Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x3f (multiple flags) count 3000 samples % app name symbol name 69755 4.2861 vmlinux __copy_user_zeroing_intel_nocache 55685 3.4215 vmlinux journal_add_journal_head 52371 3.2179 vmlinux __find_get_block 45504 2.7960 vmlinux journal_put_journal_head 36005 2.2123 vmlinux journal_stop pattern9-0-cpu4-0-09011744/summary.out Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x200 (read 3rd level cache miss) count 3000 samples % app name symbol name 1147 5.4994 vmlinux journal_add_journal_head 881 4.2240 vmlinux journal_dirty_data 872 4.1809 vmlinux blk_rq_map_sg 734 3.5192 vmlinux journal_commit_transaction 617 2.9582 vmlinux radix_tree_delete pattern9-0-cpu4-0-09011731/summary.out iozone results are original 2.6.12.4 CPU time = 207.768 sec cache aware CPU time = 184.783 sec (three times run) 184.783/207.768=88.94% (11.06% reduction) original: 
pattern9-0-cpu4-0-08191720/iozone.out: CPU Utilization: Wall time 45.997 CPU time 64.527 CPU utilization 140.28 % pattern9-0-cpu4-0-08191741/iozone.out: CPU Utilization: Wall time 46.878 CPU time 71.933 CPU utilization 153.45 % pattern9-0-cpu4-0-08191743/iozone.out: CPU Utilization: Wall time 45.152 CPU time 71.308 CPU utilization 157.93 % cache aware: pattern9-0-cpu4-0-09011728/iozone.out: CPU Utilization: Wall time 44.842 CPU time 62.465 CPU utilization 139.30 % pattern9-0-cpu4-0-09011731/iozone.out: CPU Utilization: Wall time 44.718 CPU time 59.273 CPU utilization 132.55 % pattern9-0-cpu4-0-09011744/iozone.out: CPU Utilization: Wall time 44.367 CPU time 63.045 CPU utilization 142.10 % Signed-off-by: Hiro Yoshioka <hyoshiok@miraclelinux.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> |
||
---|---|---|
.. | ||
mach-bigsmp | ||
mach-default | ||
mach-es7000 | ||
mach-generic | ||
mach-numaq | ||
mach-summit | ||
mach-visws | ||
mach-voyager | ||
8253pit.h | ||
a.out.h | ||
acpi.h | ||
agp.h | ||
alternative.h | ||
apic.h | ||
apicdef.h | ||
arch_hooks.h | ||
atomic.h | ||
auxvec.h | ||
bitops.h | ||
boot.h | ||
bug.h | ||
bugs.h | ||
byteorder.h | ||
cache.h | ||
cacheflush.h | ||
checksum.h | ||
cpu.h | ||
cpufeature.h | ||
cputime.h | ||
current.h | ||
debugreg.h | ||
delay.h | ||
desc.h | ||
div64.h | ||
dma-mapping.h | ||
dma.h | ||
dmi.h | ||
e820.h | ||
edac.h | ||
elf.h | ||
emergency-restart.h | ||
errno.h | ||
fcntl.h | ||
fixmap.h | ||
floppy.h | ||
futex.h | ||
genapic.h | ||
hardirq.h | ||
highmem.h | ||
hpet.h | ||
hw_irq.h | ||
i387.h | ||
i8253.h | ||
i8259.h | ||
ide.h | ||
io_apic.h | ||
io.h | ||
ioctl.h | ||
ioctls.h | ||
ipc.h | ||
ipcbuf.h | ||
irq.h | ||
ist.h | ||
kdebug.h | ||
kexec.h | ||
kmap_types.h | ||
kprobes.h | ||
ldt.h | ||
linkage.h | ||
local.h | ||
math_emu.h | ||
mc146818rtc.h | ||
mca_dma.h | ||
mca.h | ||
mman.h | ||
mmu_context.h | ||
mmu.h | ||
mmx.h | ||
mmzone.h | ||
module.h | ||
mpspec_def.h | ||
mpspec.h | ||
msgbuf.h | ||
msi.h | ||
msr.h | ||
mtrr.h | ||
mutex.h | ||
namei.h | ||
nmi.h | ||
node.h | ||
numa.h | ||
numaq.h | ||
page.h | ||
param.h | ||
parport.h | ||
pci-direct.h | ||
pci.h | ||
percpu.h | ||
pgalloc.h | ||
pgtable-2level-defs.h | ||
pgtable-2level.h | ||
pgtable-3level-defs.h | ||
pgtable-3level.h | ||
pgtable.h | ||
poll.h | ||
posix_types.h | ||
processor.h | ||
ptrace.h | ||
resource.h | ||
rtc.h | ||
rwlock.h | ||
rwsem.h | ||
scatterlist.h | ||
seccomp.h | ||
sections.h | ||
segment.h | ||
semaphore.h | ||
sembuf.h | ||
serial.h | ||
setup.h | ||
shmbuf.h | ||
shmparam.h | ||
sigcontext.h | ||
siginfo.h | ||
signal.h | ||
smp.h | ||
socket.h | ||
sockios.h | ||
sparsemem.h | ||
spinlock_types.h | ||
spinlock.h | ||
srat.h | ||
stat.h | ||
statfs.h | ||
string.h | ||
suspend.h | ||
system.h | ||
termbits.h | ||
termios.h | ||
thread_info.h | ||
timer.h | ||
timex.h | ||
tlb.h | ||
tlbflush.h | ||
topology.h | ||
types.h | ||
uaccess.h | ||
ucontext.h | ||
unaligned.h | ||
unistd.h | ||
user.h | ||
vga.h | ||
vic.h | ||
vm86.h | ||
voyager.h | ||
xor.h |