c22ce143d1
Use the x86 cache-bypassing copy instructions for copy_from_user().

Some performance data:

Total of GLOBAL_POWER_EVENTS (CPU cycle samples)

  2.6.12.4.orig  1921587
  2.6.12.4.nt    1599424
  1599424/1921587 = 83.23% (16.77% reduction)

BSQ_CACHE_REFERENCE (L3 cache miss)

  2.6.12.4.orig  57427
  2.6.12.4.nt    20858
  20858/57427 = 36.32% (63.7% reduction)

L3 cache miss reduction of __copy_from_user_ll

  samples  %
  37408    65.1412  vmlinux  __copy_from_user_ll
  23        0.1103  vmlinux  __copy_user_zeroing_intel_nocache
  23/37408 = 0.061% (99.94% reduction)

Top 5 of 2.6.12.4.nt

Counted GLOBAL_POWER_EVENTS events (time during which processor is not
stopped) with a unit mask of 0x01 (mandatory) count 100000
  samples  %       app name  symbol name
  128392   8.0274  vmlinux   __copy_user_zeroing_intel_nocache
  64206    4.0143  vmlinux   journal_add_journal_head
  59746    3.7355  vmlinux   do_get_write_access
  47674    2.9807  vmlinux   journal_put_journal_head
  46021    2.8774  vmlinux   journal_dirty_metadata
  pattern9-0-cpu4-0-09011728/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit)
with a unit mask of 0x3f (multiple flags) count 3000
  samples  %       app name  symbol name
  69755    4.2861  vmlinux   __copy_user_zeroing_intel_nocache
  55685    3.4215  vmlinux   journal_add_journal_head
  52371    3.2179  vmlinux   __find_get_block
  45504    2.7960  vmlinux   journal_put_journal_head
  36005    2.2123  vmlinux   journal_stop
  pattern9-0-cpu4-0-09011744/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit)
with a unit mask of 0x200 (read 3rd level cache miss) count 3000
  samples  %       app name  symbol name
  1147     5.4994  vmlinux   journal_add_journal_head
  881      4.2240  vmlinux   journal_dirty_data
  872      4.1809  vmlinux   blk_rq_map_sg
  734      3.5192  vmlinux   journal_commit_transaction
  617      2.9582  vmlinux   radix_tree_delete
  pattern9-0-cpu4-0-09011731/summary.out

iozone results:

  original 2.6.12.4  CPU time = 207.768 sec
  cache aware        CPU time = 184.783 sec
  (three runs)
  184.783/207.768 = 88.94% (11.06% reduction)

original:
  pattern9-0-cpu4-0-08191720/iozone.out:  CPU Utilization: Wall time 45.997  CPU time 64.527  CPU utilization 140.28 %
  pattern9-0-cpu4-0-08191741/iozone.out:  CPU Utilization: Wall time 46.878  CPU time 71.933  CPU utilization 153.45 %
  pattern9-0-cpu4-0-08191743/iozone.out:  CPU Utilization: Wall time 45.152  CPU time 71.308  CPU utilization 157.93 %

cache aware:
  pattern9-0-cpu4-0-09011728/iozone.out:  CPU Utilization: Wall time 44.842  CPU time 62.465  CPU utilization 139.30 %
  pattern9-0-cpu4-0-09011731/iozone.out:  CPU Utilization: Wall time 44.718  CPU time 59.273  CPU utilization 132.55 %
  pattern9-0-cpu4-0-09011744/iozone.out:  CPU Utilization: Wall time 44.367  CPU time 63.045  CPU utilization 142.10 %

Signed-off-by: Hiro Yoshioka <hyoshiok@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
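The win comes from the fact that data copied in by write() is headed for the
page cache and disk and is rarely read back by the CPU right away, so pulling
it through L1/L2/L3 only evicts lines that are still useful; non-temporal
(streaming) stores write the data while bypassing the cache hierarchy. A
minimal user-space sketch of that idea, using SSE2 intrinsics rather than the
kernel's hand-written __copy_user_zeroing_intel_nocache assembly, might look
like this (the copy_nocache name and the 16-byte destination alignment are
assumptions of the sketch; build with -msse2 on 32-bit x86):

/*
 * Illustrative user-space sketch of a cache-bypassing copy using SSE2
 * non-temporal stores (movntdq). Not the kernel routine: no fault handling,
 * and dst is assumed to be 16-byte aligned.
 */
#include <emmintrin.h>
#include <stddef.h>
#include <string.h>

static void copy_nocache(void *dst, const void *src, size_t len)
{
        char *d = dst;
        const char *s = src;

        while (len >= 16) {
                __m128i chunk = _mm_loadu_si128((const __m128i *)s);
                _mm_stream_si128((__m128i *)d, chunk);  /* store, bypassing cache */
                d += 16;
                s += 16;
                len -= 16;
        }
        if (len)
                memcpy(d, s, len);      /* tail goes through the cache */
        _mm_sfence();                   /* order the streaming stores */
}

The kernel routine additionally has to survive faults part-way through the
copy and report how many bytes were left uncopied, which is what the
filemap_copy_from_user() helpers in the header below are structured around.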
/*
 * linux/mm/filemap.h
 *
 * Copyright (C) 1994-1999 Linus Torvalds
 */

#ifndef __FILEMAP_H
#define __FILEMAP_H

#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/uio.h>
#include <linux/config.h>
#include <linux/uaccess.h>

size_t
__filemap_copy_from_user_iovec(char *vaddr,
                               const struct iovec *iov,
                               size_t base,
                               size_t bytes);

/*
 * Copy as much as we can into the page and return the number of bytes which
 * were successfully copied.  If a fault is encountered then clear the page
 * out to (offset+bytes) and return the number of bytes which were copied.
 */
static inline size_t
filemap_copy_from_user(struct page *page, unsigned long offset,
                       const char __user *buf, unsigned bytes)
{
        char *kaddr;
        int left;

        /* Fast path: atomic kmap + cache-bypassing copy; a fault in the
         * user buffer cannot be serviced here, so the copy may fall short. */
        kaddr = kmap_atomic(page, KM_USER0);
        left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
        kunmap_atomic(kaddr, KM_USER0);

        if (left != 0) {
                /* Do it the slow way */
                kaddr = kmap(page);
                left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
                kunmap(page);
        }
        return bytes - left;
}

/*
 * This has the same side effects and return value as filemap_copy_from_user().
 * The difference is that on a fault we need to memset the remainder of the
 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
 * single-segment behaviour.
 */
static inline size_t
filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
                             const struct iovec *iov, size_t base, size_t bytes)
{
        char *kaddr;
        size_t copied;

        kaddr = kmap_atomic(page, KM_USER0);
        copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
                                                base, bytes);
        kunmap_atomic(kaddr, KM_USER0);
        if (copied != bytes) {
                /* Retry with a sleeping kmap so the fault can be handled. */
                kaddr = kmap(page);
                copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
                                                        base, bytes);
                kunmap(page);
        }
        return copied;
}

/* Advance the iovec cursor (*iovp, *basep) past the bytes just copied. */
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
        const struct iovec *iov = *iovp;
        size_t base = *basep;

        while (bytes) {
                int copy = min(bytes, iov->iov_len - base);

                bytes -= copy;
                base += copy;
                if (iov->iov_len == base) {
                        /* Current segment consumed; step to the next one. */
                        iov++;
                        base = 0;
                }
        }
        *iovp = iov;
        *basep = base;
}
#endif
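The header above only declares __filemap_copy_from_user_iovec(); its
definition lives in mm/filemap.c. To make the "memset the remainder on a
fault" comment concrete, here is a hypothetical user-space analogue of those
semantics: copy from a sequence of iovec segments into one buffer and, if a
segment cannot be copied (a NULL iov_base stands in for a page fault), zero
the rest of the destination range and return only the bytes actually copied.
The copy_iovec_or_zero name and the fault simulation are inventions of the
sketch, not kernel code.

#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <sys/uio.h>

static size_t copy_iovec_or_zero(char *dst, const struct iovec *iov,
                                 size_t base, size_t bytes)
{
        size_t copied = 0;

        while (bytes) {
                size_t chunk = iov->iov_len - base;

                if (chunk > bytes)
                        chunk = bytes;
                if (iov->iov_base == NULL) {    /* simulated fault */
                        memset(dst, 0, bytes);  /* zero the remainder */
                        return copied;
                }
                memcpy(dst, (const char *)iov->iov_base + base, chunk);
                dst += chunk;
                copied += chunk;
                bytes -= chunk;
                base = 0;
                iov++;
        }
        return copied;
}

int main(void)
{
        char a[8], dstbuf[16];
        struct iovec vec[2] = {
                { .iov_base = a,    .iov_len = sizeof(a) },
                { .iov_base = NULL, .iov_len = 8 },     /* second segment "faults" */
        };
        size_t n;

        memset(a, 'A', sizeof(a));
        n = copy_iovec_or_zero(dstbuf, vec, 0, sizeof(dstbuf));
        printf("copied %zu of %zu bytes\n", n, sizeof(dstbuf));  /* copied 8 of 16 */
        return 0;
}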
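filemap_set_next_iovec() copies nothing; it only advances the (iov, base)
cursor after a copy has been committed. The same bookkeeping can be exercised
in user space. The stand-alone demo below (the advance_iovec name and the
sample segment sizes are made up for illustration) shows the cursor crossing
a segment boundary; like the kernel version, it assumes the caller never asks
to advance past the end of the iovec array.

#include <stdio.h>
#include <stddef.h>
#include <sys/uio.h>

/* User-space analogue of filemap_set_next_iovec(): move the (iov, base)
 * cursor forward by `bytes`, stepping into the next segment whenever the
 * current one is exhausted. */
static void advance_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
        const struct iovec *iov = *iovp;
        size_t base = *basep;

        while (bytes) {
                size_t copy = iov->iov_len - base;

                if (copy > bytes)
                        copy = bytes;
                bytes -= copy;
                base += copy;
                if (base == iov->iov_len) {
                        iov++;
                        base = 0;
                }
        }
        *iovp = iov;
        *basep = base;
}

int main(void)
{
        char a[10], b[20], c[30];
        struct iovec vec[3] = {
                { .iov_base = a, .iov_len = sizeof(a) },
                { .iov_base = b, .iov_len = sizeof(b) },
                { .iov_base = c, .iov_len = sizeof(c) },
        };
        const struct iovec *cur = vec;
        size_t base = 0;

        /* Consume 25 bytes: all of a (10) plus 15 bytes of b. */
        advance_iovec(&cur, &base, 25);
        printf("segment %td, offset %zu\n", cur - vec, base);  /* segment 1, offset 15 */
        return 0;
}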