Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block-related fixes from Jens Axboe:

 - Improvements to the buffered and direct write IO plugging from
   Fengguang.

 - Abstract out the mapping of a bio in a request, and use that to
   provide a blk_bio_map_sg() helper.  Useful for mapping just a bio
   instead of a full request.

 - Regression fix from Hugh, fixing up a patch that went into the
   previous release cycle (and marked stable, too) attempting to
   prevent a loop in __getblk_slow().

 - Updates to discard requests, fixing up the sizing and how we align
   them.  Also a change to disallow merging of discard requests, since
   that doesn't really work properly yet.

 - A few drbd fixes.

 - Documentation updates.

* 'for-linus' of git://git.kernel.dk/linux-block:
  block: replace __getblk_slow misfix by grow_dev_page fix
  drbd: Write all pages of the bitmap after an online resize
  drbd: Finish requests that completed while IO was frozen
  drbd: fix drbd wire compatibility for empty flushes
  Documentation: update tunable options in block/cfq-iosched.txt
  Documentation: update tunable options in block/cfq-iosched.txt
  Documentation: update missing index files in block/00-INDEX
  block: move down direct IO plugging
  block: remove plugging at buffered write time
  block: disable discard request merge temporarily
  bio: Fix potential memory leak in bio_find_or_create_slab()
  block: Don't use static to define "void *p" in show_partition_start()
  block: Add blk_bio_map_sg() helper
  block: Introduce __blk_segment_map_sg() helper
  fs/block-dev.c:fix performance regression in O_DIRECT writes to md block devices
  block: split discard into aligned requests
  block: reorganize rounding of max_discard_sectors
commit a7e546f175
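
Note on the new helper: blk_bio_map_sg() (added in block/blk-merge.c below) maps
a single bio rather than a whole request.  As a rough sketch of the intended use
(the caller here is hypothetical; only the helper's signature and the
bio->bi_phys_segments requirement come from this series):

    #include <linux/blkdev.h>
    #include <linux/scatterlist.h>

    /* Hypothetical example, not from this series: build a scatterlist for
     * one bio and hand it to the hardware.  The caller must supply an
     * sglist with room for at least bio->bi_phys_segments entries. */
    static int example_map_one_bio(struct request_queue *q, struct bio *bio,
                                   struct scatterlist *sglist)
    {
            int nsegs = blk_bio_map_sg(q, bio, sglist);

            /* nsegs entries of sglist are now set up, and the last one
             * carries the end-of-list mark (sg_mark_end). */
            return nsegs;
    }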
Documentation/block/00-INDEX
@@ -3,15 +3,21 @@
 biodoc.txt
 	- Notes on the Generic Block Layer Rewrite in Linux 2.5
 capability.txt
-	- Generic Block Device Capability (/sys/block/<disk>/capability)
+	- Generic Block Device Capability (/sys/block/<device>/capability)
+cfq-iosched.txt
+	- CFQ IO scheduler tunables
+data-integrity.txt
+	- Block data integrity
 deadline-iosched.txt
 	- Deadline IO scheduler tunables
 ioprio.txt
 	- Block io priorities (in CFQ scheduler)
+queue-sysfs.txt
+	- Queue's sysfs entries
 request.txt
 	- The members of struct request (in include/linux/blkdev.h)
 stat.txt
-	- Block layer statistics in /sys/block/<dev>/stat
+	- Block layer statistics in /sys/block/<device>/stat
 switching-sched.txt
 	- Switching I/O schedulers at runtime
 writeback_cache_control.txt
Documentation/block/cfq-iosched.txt
@@ -1,3 +1,14 @@
+CFQ (Complete Fairness Queueing)
+===============================
+
+The main aim of the CFQ scheduler is to provide a fair allocation of the disk
+I/O bandwidth for all the processes which request an I/O operation.
+
+CFQ maintains a per process queue for the processes which request I/O
+operation (synchronous requests). In case of asynchronous requests, all the
+requests from all the processes are batched together according to their
+process's I/O priority.
+
 CFQ ioscheduler tunables
 ========================
 
@@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID
 controller or for storage arrays), setting slice_idle=0 might end up in better
 throughput and acceptable latencies.
 
+back_seek_max
+-------------
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of a request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of the two requests is considered equivalent.
+
+So the scheduler will not bias toward one or the other request (otherwise it
+would bias toward the front request). The default value of back_seek_penalty
+is 2.
+
+fifo_expire_async
+-----------------
+This parameter is used to set the timeout of asynchronous requests. The
+default value is 248ms.
+
+fifo_expire_sync
+----------------
+This parameter is used to set the timeout of synchronous requests. The
+default value is 124ms. To favor synchronous requests over asynchronous
+ones, this value should be decreased relative to fifo_expire_async.
+
+slice_async
+-----------
+This parameter is the same as slice_sync but for the asynchronous queue. The
+default value is 40ms.
+
+slice_async_rq
+--------------
+This parameter is used to limit the dispatching of asynchronous requests to
+the device request queue in the queue's slice time. The maximum number of
+requests that are allowed to be dispatched also depends upon the io priority.
+The default value is 2.
+
+slice_sync
+----------
+When a queue is selected for execution, its IO requests are only
+executed for a certain amount of time (time_slice) before switching to
+another queue. This parameter is used to calculate the time slice of the
+synchronous queue.
+
+time_slice is computed using the equation:
+time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
+time_slice of the synchronous queue, increase the value of slice_sync. The
+default value is 100ms.
+
+quantum
+-------
+This specifies the number of requests dispatched to the device queue. In a
+queue's time slice, a request will not be dispatched if the number of
+requests in the device exceeds this parameter. This parameter is used for
+synchronous requests.
+
+In case of storage with several disks, this setting can limit the parallel
+processing of requests. Therefore, increasing the value can improve the
+performance although this can cause the latency of some I/O to increase due
+to more requests.
+
 CFQ IOPS Mode for group scheduling
 ===================================
 Basic CFQ design is to provide priority based time slices. Higher priority
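
To make the slice_sync formula above concrete, here is a small worked example
(illustration only, not part of the patch); it just evaluates time_slice for
each priority at the default slice_sync of 100ms:

    #include <stdio.h>

    int main(void)
    {
            int slice_sync = 100;   /* default, in ms */
            int prio;

            for (prio = 0; prio < 8; prio++) {
                    /* time_slice = slice_sync + (slice_sync/5 * (4 - prio)) */
                    int time_slice = slice_sync + (slice_sync / 5 * (4 - prio));
                    printf("prio %d: %d ms\n", prio, time_slice);
            }
            /* prio 0 gets 180ms, prio 4 gets 100ms, prio 7 gets 40ms */
            return 0;
    }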
Documentation/block/queue-sysfs.txt
@@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory.
 Files denoted with a RO postfix are readonly and the RW postfix means
 read-write.
 
+add_random (RW)
+----------------
+This file allows to turn off the disk entropy contribution. The default
+value of this file is '1' (on).
+
+discard_granularity (RO)
+-----------------------
+This shows the size of internal allocation of the device in bytes, if
+reported by the device. A value of '0' means the device does not support
+the discard functionality.
+
+discard_max_bytes (RO)
+----------------------
+Devices that support discard functionality may have internal limits on
+the number of bytes that can be trimmed or unmapped in a single operation.
+The discard_max_bytes parameter is set by the device driver to the maximum
+number of bytes that can be discarded in a single operation. Discard
+requests issued to the device must not exceed this limit. A discard_max_bytes
+value of 0 means that the device does not support discard functionality.
+
+discard_zeroes_data (RO)
+------------------------
+When read, this file will show if the discarded blocks are zeroed by the
+device or not. If its value is '1' the blocks are zeroed, otherwise not.
+
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
 
+iostats (RW)
+-------------
+This file is used to control (on/off) the iostats accounting of the
+disk.
+
+logical_block_size (RO)
+-----------------------
+This is the logical block size of the device, in bytes.
+
 max_hw_sectors_kb (RO)
 ----------------------
 This is the maximum number of kilobytes supported in a single data transfer.
 
+max_integrity_segments (RO)
+---------------------------
+When read, this file shows the max limit of integrity segments as
+set by the block layer which a hardware controller can handle.
+
 max_sectors_kb (RW)
 -------------------
 This is the maximum number of kilobytes that the block layer will allow
 for a filesystem request. Must be smaller than or equal to the maximum
 size allowed by the hardware.
 
+max_segments (RO)
+-----------------
+Maximum number of segments of the device.
+
+max_segment_size (RO)
+---------------------
+Maximum segment size of the device.
+
+minimum_io_size (RO)
+--------------------
+This is the smallest preferred io size reported by the device.
+
 nomerges (RW)
 -------------
 This enables the user to disable the lookup logic involved with IO
@@ -45,11 +96,24 @@ per-block-cgroup request pool.  IOW, if there are N block cgroups,
 each request queue may have up to N request pools, each independently
 regulated by nr_requests.
 
+optimal_io_size (RO)
+--------------------
+This is the optimal io size reported by the device.
+
+physical_block_size (RO)
+------------------------
+This is the physical block size of the device, in bytes.
+
 read_ahead_kb (RW)
 ------------------
 Maximum number of kilobytes to read-ahead for filesystems on this block
 device.
 
+rotational (RW)
+---------------
+This file is used to state if the device is of rotational type or
+non-rotational type.
+
 rq_affinity (RW)
 ----------------
 If this option is '1', the block layer will migrate request completions to the
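
As a quick illustration of consuming these attributes from user space (not
part of the patch; the device name here is made up), a program can simply
read the sysfs file:

    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            /* "sda" is a made-up device name; any block device works. */
            FILE *f = fopen("/sys/block/sda/queue/rotational", "r");

            if (!f)
                    return 1;
            if (fgets(buf, sizeof(buf), f))
                    printf("rotational: %s", buf); /* "1" spinning, "0" not */
            fclose(f);
            return 0;
    }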
block/blk-lib.c
@@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct request_queue *q = bdev_get_queue(bdev);
 	int type = REQ_WRITE | REQ_DISCARD;
 	unsigned int max_discard_sectors;
+	unsigned int granularity, alignment, mask;
 	struct bio_batch bb;
 	struct bio *bio;
 	int ret = 0;
@@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
+	/* Zero-sector (unknown) and one-sector granularities are the same. */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	mask = granularity - 1;
+	alignment = (bdev_discard_alignment(bdev) >> 9) & mask;
+
 	/*
 	 * Ensure that max_discard_sectors is of the proper
-	 * granularity
+	 * granularity, so that requests stay aligned after a split.
 	 */
 	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors = round_down(max_discard_sectors, granularity);
 	if (unlikely(!max_discard_sectors)) {
 		/* Avoid infinite loop below. Being cautious never hurts. */
 		return -EOPNOTSUPP;
-	} else if (q->limits.discard_granularity) {
-		unsigned int disc_sects = q->limits.discard_granularity >> 9;
-
-		max_discard_sectors &= ~(disc_sects - 1);
 	}
 
 	if (flags & BLKDEV_DISCARD_SECURE) {
@@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	bb.wait = &wait;
 
 	while (nr_sects) {
+		unsigned int req_sects;
+		sector_t end_sect;
+
 		bio = bio_alloc(gfp_mask, 1);
 		if (!bio) {
 			ret = -ENOMEM;
 			break;
 		}
 
+		req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+		/*
+		 * If splitting a request, and the next starting sector would be
+		 * misaligned, stop the discard at the previous aligned sector.
+		 */
+		end_sect = sector + req_sects;
+		if (req_sects < nr_sects && (end_sect & mask) != alignment) {
+			end_sect =
+				round_down(end_sect - alignment, granularity)
+				+ alignment;
+			req_sects = end_sect - sector;
+		}
+
 		bio->bi_sector = sector;
 		bio->bi_end_io = bio_batch_end_io;
 		bio->bi_bdev = bdev;
 		bio->bi_private = &bb;
 
-		if (nr_sects > max_discard_sectors) {
-			bio->bi_size = max_discard_sectors << 9;
-			nr_sects -= max_discard_sectors;
-			sector += max_discard_sectors;
-		} else {
-			bio->bi_size = nr_sects << 9;
-			nr_sects = 0;
-		}
+		bio->bi_size = req_sects << 9;
+		nr_sects -= req_sects;
+		sector = end_sect;
 
 		atomic_inc(&bb.done);
 		submit_bio(type, bio);
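
The rounding in the split above is easier to see with numbers. A standalone
sketch of the same arithmetic with made-up values (granularity of 8 sectors,
discard alignment of 2 sectors within the granule), illustration only:

    #include <stdio.h>

    int main(void)
    {
            unsigned int granularity = 8;          /* sectors per granule */
            unsigned int alignment = 2;            /* offset within granule */
            unsigned long long sector = 10;        /* start of this chunk */
            unsigned long long req_sects = 21;     /* capped by max_discard_sectors */
            unsigned long long end_sect = sector + req_sects;   /* 31 */

            /* round_down(end_sect - alignment, granularity) + alignment */
            end_sect = (end_sect - alignment) / granularity * granularity
                       + alignment;
            req_sects = end_sect - sector;

            /* prints end_sect=26 req_sects=16: the chunk stops early so the
             * next chunk starts at an aligned sector (26 & 7 == 2). */
            printf("end_sect=%llu req_sects=%llu\n", end_sect, req_sects);
            return 0;
    }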
block/blk-merge.c
@@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	return 0;
 }
 
+static void
+__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
+		     struct scatterlist *sglist, struct bio_vec **bvprv,
+		     struct scatterlist **sg, int *nsegs, int *cluster)
+{
+	int nbytes = bvec->bv_len;
+
+	if (*bvprv && *cluster) {
+		if ((*sg)->length + nbytes > queue_max_segment_size(q))
+			goto new_segment;
+
+		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
+			goto new_segment;
+		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
+			goto new_segment;
+
+		(*sg)->length += nbytes;
+	} else {
+new_segment:
+		if (!*sg)
+			*sg = sglist;
+		else {
+			/*
+			 * If the driver previously mapped a shorter
+			 * list, we could see a termination bit
+			 * prematurely unless it fully inits the sg
+			 * table on each mapping. We KNOW that there
+			 * must be more entries here or the driver
+			 * would be buggy, so force clear the
+			 * termination bit to avoid doing a full
+			 * sg_init_table() in drivers for each command.
+			 */
+			(*sg)->page_link &= ~0x02;
+			*sg = sg_next(*sg);
+		}
+
+		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+		(*nsegs)++;
+	}
+	*bvprv = bvec;
+}
+
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	bvprv = NULL;
 	sg = NULL;
 	rq_for_each_segment(bvec, rq, iter) {
-		int nbytes = bvec->bv_len;
-
-		if (bvprv && cluster) {
-			if (sg->length + nbytes > queue_max_segment_size(q))
-				goto new_segment;
-
-			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-				goto new_segment;
-
-			sg->length += nbytes;
-		} else {
-new_segment:
-			if (!sg)
-				sg = sglist;
-			else {
-				/*
-				 * If the driver previously mapped a shorter
-				 * list, we could see a termination bit
-				 * prematurely unless it fully inits the sg
-				 * table on each mapping. We KNOW that there
-				 * must be more entries here or the driver
-				 * would be buggy, so force clear the
-				 * termination bit to avoid doing a full
-				 * sg_init_table() in drivers for each command.
-				 */
-				sg->page_link &= ~0x02;
-				sg = sg_next(sg);
-			}
-
-			sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
-			nsegs++;
-		}
-		bvprv = bvec;
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
 	} /* segments in rq */
@@ -199,6 +209,43 @@ new_segment:
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
+/**
+ * blk_bio_map_sg - map a bio to a scatterlist
+ * @q: request_queue in question
+ * @bio: bio being mapped
+ * @sglist: scatterlist being mapped
+ *
+ * Note:
+ *    Caller must make sure sg can hold bio->bi_phys_segments entries
+ *
+ * Will return the number of sg entries setup
+ */
+int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+		   struct scatterlist *sglist)
+{
+	struct bio_vec *bvec, *bvprv;
+	struct scatterlist *sg;
+	int nsegs, cluster;
+	unsigned long i;
+
+	nsegs = 0;
+	cluster = blk_queue_cluster(q);
+
+	bvprv = NULL;
+	sg = NULL;
+	bio_for_each_segment(bvec, bio, i) {
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
+	} /* segments in bio */
+
+	if (sg)
+		sg_mark_end(sg);
+
+	BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
+	return nsegs;
+}
+EXPORT_SYMBOL(blk_bio_map_sg);
+
 static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
block/genhd.c
@@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
 
 static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 {
-	static void *p;
+	void *p;
 
 	p = disk_seqf_start(seqf, pos);
 	if (!IS_ERR_OR_NULL(p) && !*pos)
drivers/block/drbd/drbd_bitmap.c
@@ -889,6 +889,7 @@ struct bm_aio_ctx {
 	unsigned int done;
 	unsigned flags;
 #define BM_AIO_COPY_PAGES	1
+#define BM_WRITE_ALL_PAGES	2
 	int error;
 	struct kref kref;
 };
@@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
 		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
 			break;
 		if (rw & WRITE) {
-			if (bm_test_page_unchanged(b->bm_pages[i])) {
+			if (!(flags & BM_WRITE_ALL_PAGES) &&
+			    bm_test_page_unchanged(b->bm_pages[i])) {
 				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
 				continue;
 			}
@@ -1140,6 +1142,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 	return bm_rw(mdev, WRITE, 0, 0);
 }
 
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
+}
+
 /**
  * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
drivers/block/drbd/drbd_int.h
@@ -1469,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
 extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
 		unsigned long al_enr);
drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void _tl_clear(struct drbd_conf *mdev);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 
 	/* Actions operating on the disk state, also want to work on
 	   requests that got barrier acked. */
-	switch (what) {
-	case fail_frozen_disk_io:
-	case restart_frozen_disk_io:
-		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-			req = list_entry(le, struct drbd_request, tl_requests);
-			_req_mod(req, what);
-		}
-
-	case connection_lost_while_pending:
-	case resend:
-		break;
-	default:
-		dev_err(DEV, "what = %d in _tl_restart()\n", what);
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(req, what);
 	}
 }
@@ -458,12 +450,17 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
  * receiver thread and the worker thread.
  */
 void tl_clear(struct drbd_conf *mdev)
+{
+	spin_lock_irq(&mdev->req_lock);
+	_tl_clear(mdev);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+static void _tl_clear(struct drbd_conf *mdev)
 {
 	struct list_head *le, *tle;
 	struct drbd_request *r;
 
-	spin_lock_irq(&mdev->req_lock);
-
 	_tl_restart(mdev, connection_lost_while_pending);
 
 	/* we expect this list to be empty. */
@@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
 
 	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 
-	spin_unlock_irq(&mdev->req_lock);
 }
 
 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
@@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	if (ns.susp_fen) {
 		/* case1: The outdate peer handler is successful: */
 		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
-			tl_clear(mdev);
 			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
 				drbd_uuid_new_current(mdev);
 				clear_bit(NEW_CUR_UUID, &mdev->flags);
 			}
 			spin_lock_irq(&mdev->req_lock);
+			_tl_clear(mdev);
 			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
drivers/block/drbd/drbd_nl.c
@@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 			la_size_changed && md_moved ? "size changed and md moved" :
 			la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
-		err = drbd_bitmap_io(mdev, &drbd_bm_write,
+		err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 				"size changed", BM_LOCKED_MASK);
 		if (err) {
 			rv = dev_size_error;
 			goto out;
drivers/block/drbd/drbd_req.c
@@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case resend:
+		/* Simply complete (local only) READs. */
+		if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+			_req_may_be_done(req, m);
+			break;
+		}
+
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
 		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 		   Throwing them out of the TL here by pretending we got a BARRIER_ACK
@@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		req->private_bio = NULL;
 	}
 	if (rw == WRITE) {
-		remote = 1;
+		/* Need to replicate writes.  Unless it is an empty flush,
+		 * which is better mapped to a DRBD P_BARRIER packet,
+		 * also for drbd wire protocol compatibility reasons. */
+		if (unlikely(size == 0)) {
+			/* The only size==0 bios we expect are empty flushes. */
+			D_ASSERT(bio->bi_rw & REQ_FLUSH);
+			remote = 0;
+		} else
+			remote = 1;
 	} else {
 		/* READ || READA */
 		if (local) {
@@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	 * extent. This waits for any resync activity in the corresponding
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
-	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+	 * of transactional on-disk meta data updates.
+	 * Empty flushes don't need to go into the activity log, they can only
+	 * flush data for pending writes which are already in there. */
+	if (rw == WRITE && local && size
+	    && !test_bit(AL_SUSPENDED, &mdev->flags)) {
 		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, sector);
 	}
@@ -994,7 +1011,10 @@ allocate_barrier:
 	if (rw == WRITE && _req_conflicts(req))
 		goto fail_conflicting;
 
-	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+	/* no point in adding empty flushes to the transfer log,
+	 * they are mapped to drbd barriers already. */
+	if (likely(size!=0))
+		list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
 
 	/* NOTE remote first: to get the concurrent write detection right,
 	 * we must register the request before start of local IO. */
@@ -1014,6 +1034,14 @@ allocate_barrier:
 	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
 		maybe_pull_ahead(mdev);
 
+	/* If this was a flush, queue a drbd barrier/start a new epoch.
+	 * Unless the current epoch was empty anyways, or we are not currently
+	 * replicating, in which case there is no point. */
+	if (unlikely(bio->bi_rw & REQ_FLUSH)
+	    && mdev->newest_tle->n_writes
+	    && drbd_should_do_remote(mdev->state))
+		queue_barrier(mdev);
+
 	spin_unlock_irq(&mdev->req_lock);
 	kfree(b); /* if someone else has beaten us to it... */
fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
 	unsigned int sz = sizeof(struct bio) + extra_size;
 	struct kmem_cache *slab = NULL;
-	struct bio_slab *bslab;
+	struct bio_slab *bslab, *new_bio_slabs;
 	unsigned int i, entry = -1;
 
 	mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 
 	if (bio_slab_nr == bio_slab_max && entry == -1) {
 		bio_slab_max <<= 1;
-		bio_slabs = krealloc(bio_slabs,
+		new_bio_slabs = krealloc(bio_slabs,
 				     bio_slab_max * sizeof(struct bio_slab),
 				     GFP_KERNEL);
-		if (!bio_slabs)
+		if (!new_bio_slabs)
 			goto out_unlock;
+		bio_slabs = new_bio_slabs;
 	}
 	if (entry == -1)
 		entry = bio_slab_nr++;
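
The bio_slabs fix above is an instance of a general C idiom: never assign the
result of krealloc()/realloc() straight back to the only pointer you hold,
because a failed reallocation would leak the original block. A userspace
sketch of the same pattern (illustration only, not from this series):

    #include <stdlib.h>
    #include <string.h>

    /* Grow *buf to twice its capacity without leaking it on failure. */
    static int grow(char **buf, size_t *cap)
    {
            char *new_buf = realloc(*buf, *cap * 2); /* old pointer kept safe */

            if (!new_buf)
                    return -1;  /* *buf is still valid and freeable */
            *buf = new_buf;
            *cap *= 2;
            return 0;
    }

    int main(void)
    {
            size_t cap = 16;
            char *buf = malloc(cap);

            if (!buf || grow(&buf, &cap) < 0) {
                    free(buf);
                    return 1;
            }
            strcpy(buf, "ok");
            free(buf);
            return 0;
    }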
fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 /*
  * Initialise the state of a blockdev page's buffers.
  */
-static void
+static sector_t
 init_page_buffers(struct page *page, struct block_device *bdev,
 			sector_t block, int size)
 {
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 		block++;
 		bh = bh->b_this_page;
 	} while (bh != head);
+
+	/*
+	 * Caller needs to validate requested block against end of device.
+	 */
+	return end_block;
 }
 
 /*
  * Create the page-cache page that contains the requested block.
  *
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
  */
-static struct page *
+static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-		pgoff_t index, int size)
+		pgoff_t index, int size, int sizebits)
 {
 	struct inode *inode = bdev->bd_inode;
 	struct page *page;
 	struct buffer_head *bh;
+	sector_t end_block;
+	int ret = 0;		/* Will call free_more_memory() */
 
 	page = find_or_create_page(inode->i_mapping, index,
 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
 	if (!page)
-		return NULL;
+		return ret;
 
 	BUG_ON(!PageLocked(page));
 
 	if (page_has_buffers(page)) {
 		bh = page_buffers(page);
 		if (bh->b_size == size) {
-			init_page_buffers(page, bdev, block, size);
-			return page;
+			end_block = init_page_buffers(page, bdev,
+						index << sizebits, size);
+			goto done;
 		}
 		if (!try_to_free_buffers(page))
 			goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	 */
 	spin_lock(&inode->i_mapping->private_lock);
 	link_dev_buffers(page, bh);
-	init_page_buffers(page, bdev, block, size);
+	end_block = init_page_buffers(page, bdev, index << sizebits, size);
 	spin_unlock(&inode->i_mapping->private_lock);
-	return page;
+done:
+	ret = (block < end_block) ? 1 : -ENXIO;
 failed:
 	unlock_page(page);
 	page_cache_release(page);
-	return NULL;
+	return ret;
 }
 
 /*
@@ -999,7 +1007,6 @@ failed:
 static int
 grow_buffers(struct block_device *bdev, sector_t block, int size)
 {
-	struct page *page;
 	pgoff_t index;
 	int sizebits;
 
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 			bdevname(bdev, b));
 		return -EIO;
 	}
-	block = index << sizebits;
 	/* Create a page with the proper size buffers.. */
-	page = grow_dev_page(bdev, block, index, size);
-	if (!page)
-		return 0;
-	unlock_page(page);
-	page_cache_release(page);
-	return 1;
+	return grow_dev_page(bdev, block, index, size, sizebits);
 }
 
 static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
-	int ret;
-	struct buffer_head *bh;
-
 	/* Size must be multiple of hard sectorsize */
 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 		return NULL;
 	}
 
-retry:
-	bh = __find_get_block(bdev, block, size);
-	if (bh)
-		return bh;
+	for (;;) {
+		struct buffer_head *bh;
+		int ret;
 
-	ret = grow_buffers(bdev, block, size);
-	if (ret == 0) {
-		free_more_memory();
-		goto retry;
-	} else if (ret > 0) {
 		bh = __find_get_block(bdev, block, size);
 		if (bh)
 			return bh;
+
+		ret = grow_buffers(bdev, block, size);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
+			free_more_memory();
 	}
-	return NULL;
 }
 
 /*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
- * __getblk() cannot fail - it just keeps trying.  If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block.  Very weird.
- *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	unsigned long user_addr;
 	size_t bytes;
 	struct buffer_head map_bh = { 0, };
+	struct blk_plug plug;
 
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 				PAGE_SIZE - user_addr / PAGE_SIZE);
 	}
 
+	blk_start_plug(&plug);
+
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
 		sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (sdio.bio)
 		dio_bio_submit(dio, &sdio);
 
+	blk_finish_plug(&plug);
+
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
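
For context, the plugging idiom these patches move around follows one simple
pattern: batch bio submissions between blk_start_plug() and blk_finish_plug()
so the block layer can merge adjacent requests before the queue is unplugged.
A kernel-context sketch with a hypothetical caller (not code from this
series):

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    /* Hypothetical helper: submit a batch of already-prepared bios
     * under one plug so they can be merged before dispatch. */
    static void example_submit_batch(struct bio **bios, int n)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);
            for (i = 0; i < n; i++)
                    submit_bio(READ, bios[i]);  /* queued under the plug */
            blk_finish_plug(&plug);             /* flushes the batch */
    }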
include/linux/blkdev.h
@@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
 *	it already be started by driver.
 */
 #define RQ_NOMERGE_FLAGS	\
-	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD)
 #define rq_mergeable(rq)	\
 	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
 	 (((rq)->cmd_flags & REQ_DISCARD) || \
@@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
+extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+			  struct scatterlist *sglist);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 
@@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
 		& (lim->discard_granularity - 1);
 }
 
+static inline int bdev_discard_alignment(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (bdev != bdev->bd_contains)
+		return bdev->bd_part->discard_alignment;
+
+	return q->limits.discard_alignment;
+}
+
 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
 {
 	if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		retval = filemap_write_and_wait_range(mapping, pos,
 					pos + iov_length(iov, nr_segs) - 1);
 		if (!retval) {
-			struct blk_plug plug;
-
-			blk_start_plug(&plug);
 			retval = mapping->a_ops->direct_IO(READ, iocb,
 						iov, pos, nr_segs);
-			blk_finish_plug(&plug);
 		}
 		if (retval > 0) {
 			*ppos = pos + retval;
@@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
-	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-	blk_finish_plug(&plug);
 	sb_end_write(inode->i_sb);
 	return ret;
 }