linux/drivers/md/bcache/journal.h

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_JOURNAL_H
#define _BCACHE_JOURNAL_H

/*
 * THE JOURNAL:
 *
 * The journal is treated as a circular buffer of buckets - a journal entry
 * never spans two buckets. This means we can resize the journal at runtime
 * (not implemented yet); it will also be needed for bcache on raw flash
 * support.
 *
 * Journal entries contain a list of keys, ordered by the time they were
 * inserted; thus journal replay just has to reinsert the keys.
 *
 * We also keep some things in the journal header that are logically part of the
 * superblock - all the things that are frequently updated. This is for future
 * bcache on raw flash support; the superblock (which will become another
 * journal) can't be moved or wear leveled, so it contains just enough
 * information to find the main journal, and the superblock only has to be
 * rewritten when we want to move/wear level the main journal.
 *
 * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
 * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
 * from cache misses, which don't have to be journaled, and for writeback and
 * moving gc we work around it by flushing the btree to disk before updating the
 * gc information. But it is a potential issue with incremental garbage
 * collection, and it's fragile.
 *
 * OPEN JOURNAL ENTRIES:
 *
 * Each journal entry contains, in the header, the sequence number of the last
 * journal entry still open - i.e. that has keys that haven't been flushed to
 * disk in the btree.
 *
 * We track this by maintaining a refcount for every open journal entry, in a
 * fifo; each entry in the fifo corresponds to a particular journal
 * entry/sequence number. When the refcount at the tail of the fifo goes to
 * zero, we pop it off - thus, the size of the fifo tells us the number of open
 * journal entries.
 *
 * We take a refcount on a journal entry when we add some keys to a journal
 * entry that we're going to insert (held by struct btree_op), and then when we
 * insert those keys into the btree the btree write we're setting up takes a
 * copy of that refcount (held by struct btree_write). That refcount is dropped
 * when the btree write completes.
 *
 * A struct btree_write can only hold a refcount on a single journal entry, but
 * might contain keys for many journal entries - we handle this by making sure
 * it always has a refcount on the _oldest_ journal entry of all the journal
 * entries it has keys for.
 *
 * JOURNAL RECLAIM:
 *
 * As mentioned previously, our fifo of refcounts tells us the number of open
 * journal entries; from that and the current journal sequence number we compute
 * last_seq - the oldest journal entry we still need. We write last_seq in each
 * journal entry, and we also have to keep track of where it exists on disk so
 * we don't overwrite it when we loop around the journal.
 *
 * To do that we track, for each journal bucket, the sequence number of the
 * newest journal entry it contains - if we don't need that journal entry we
 * don't need anything in that bucket anymore. From that we track the last
 * journal bucket we still need; all this is tracked in struct journal_device
 * and updated by journal_reclaim().
 *
 * JOURNAL FILLING UP:
 *
 * There are two ways the journal could fill up; either we could run out of
 * space to write to, or we could have too many open journal entries and run out
 * of room in the fifo of refcounts. Since those refcounts are decremented
 * without any locking we can't safely resize that fifo, so we handle it the
 * same way.
 *
 * If the journal fills up, we start flushing dirty btree nodes until we can
 * allocate space for a journal write again - preferentially flushing btree
 * nodes that are pinning the oldest journal entries first.
 */

/*
 * One journal entry read back from disk, kept on a list. Only used for holding
 * the journal entries we read during cache registration.
 *
 * NOTE(review): this comment used to name the reader btree_journal_read();
 * this header only declares bch_journal_read() - presumably the same
 * function, confirm in journal.c.
 */
struct journal_replay {
	/* List linkage - presumably the list passed to bch_journal_read() */
	struct list_head	list;
	/* Refcount for this entry - presumably a slot in journal.pin; confirm */
	atomic_t		*pin;
	/* The journal entry itself, embedded rather than pointed to */
	struct jset		j;
};

/*
 * We put two of these in struct journal (its w[] array); we use them for
 * writes to the journal that are being staged or in flight.
 */
struct journal_write {
	/* The journal entry being staged or written */
	struct jset		*data;
	/* NOTE(review): presumably the page order of the data buffer - confirm */
#define JSET_BITS		3

	/* Cache set this write belongs to */
	struct cache_set	*c;
	/* Closures waiting on this write - presumably woken when it completes */
	struct closure_waitlist	wait;
	/* NOTE(review): looks like "holds keys not yet written out" - confirm */
	bool			dirty;
	/* NOTE(review): looks like "a write has been requested" - confirm */
	bool			need_write;
};

/* Journal state for one cache set; embedded in struct cache_set */
struct journal {
	spinlock_t		lock;
	/* used when waiting because the journal was full */
	struct closure_waitlist	wait;
	struct closure		io;
	int			io_in_flight;
	struct delayed_work	work;

	/* Number of blocks free in the bucket(s) we're currently writing to */
	unsigned int		blocks_free;
	/* Journal sequence number - presumably of the current entry; confirm */
	uint64_t		seq;
	/* Fifo of refcounts, one per open journal entry */
	DECLARE_FIFO(atomic_t, pin);

	BKEY_PADDED(key);

	/* The two write slots; cur presumably points at the one being filled */
	struct journal_write	w[2];
	struct journal_write	*cur;
};

/*
 * Per-device journal state; embedded in struct cache. seq[] has one slot per
 * journal bucket, and cur_idx, last_idx and discard_idx refer to entries of
 * the array of journal buckets in cache_sb.
 */
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be reused.
	 */
	uint64_t		seq[SB_JOURNAL_BUCKETS];

	/* Journal bucket we're currently writing to */
	unsigned int		cur_idx;

	/* Last journal bucket that still contains an open journal entry */
	unsigned int		last_idx;

	/* Next journal bucket to be discarded */
	unsigned int		discard_idx;

	/* Discard states */
#define DISCARD_READY		0
#define DISCARD_IN_FLIGHT	1
#define DISCARD_DONE		2
	/*
	 * Discard state. NOTE(review): presumably holds one of the DISCARD_*
	 * values above; the earlier "1 - in flight, -1 - completed" wording
	 * does not match those constants - confirm in journal.c.
	 */
	atomic_t		discard_in_flight;

	/* Work item, bio and single bio_vec - by their names, for the discard */
	struct work_struct	discard_work;
	struct bio		discard_bio;
	struct bio_vec		discard_bv;

	/* Bio for journal reads/writes to this device */
	struct bio		bio;
	struct bio_vec		bv[8];
};

/*
 * Nonzero when pin l has a greater fifo_idx() than pin r within
 * (c)->journal.pin. NOTE(review): presumably a greater index means a newer
 * journal entry - confirm against fifo_idx(). c is expanded twice, so don't
 * pass an expression with side effects.
 */
#define journal_pin_cmp(c, l, r)				\
	(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))

/* NOTE(review): presumably the number of slots in journal.pin - confirm */
#define JOURNAL_PIN	20000

/*
 * Nonzero when the journal is considered full: either no blocks are free in
 * the bucket(s) being written, or the pin fifo has at most one free slot.
 * j is expanded twice - don't pass an expression with side effects.
 */
#define journal_full(j)						\
	(!((j)->blocks_free && fifo_free(&(j)->pin) > 1))

/* Forward declarations for the pointer types used in the prototypes below */
struct closure;
struct cache_set;
struct btree_op;
struct keylist;

/*
 * Journal entry points. Parameter names are for readability only; types and
 * order are unchanged.
 *
 * NOTE(review): bch_journal() returns an atomic_t * - presumably the pin
 * (refcount) of the journal entry the keys were added to; confirm in
 * journal.c.
 */
atomic_t *bch_journal(struct cache_set *c, struct keylist *keys,
		      struct closure *parent);
void bch_journal_next(struct journal *j);
void bch_journal_mark(struct cache_set *c, struct list_head *list);
void bch_journal_meta(struct cache_set *c, struct closure *cl);
int bch_journal_read(struct cache_set *c, struct list_head *list);
int bch_journal_replay(struct cache_set *c, struct list_head *list);

/* Release and set up the journal state of a cache set, by their names */
void bch_journal_free(struct cache_set *c);
int bch_journal_alloc(struct cache_set *c);

#endif /* _BCACHE_JOURNAL_H */
License cleanup: add SPDX GPL-2.0 license identifier to files with no license Many source files in the tree are missing licensing information, which makes it harder for compliance tools to determine the correct license. By default all files without license information are under the default license of the kernel, which is GPL version 2. Update the files which contain no license information with the 'GPL-2.0' SPDX license identifier. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. How this work was done: Patches were generated and checked against linux-4.14-rc6 for a subset of the use cases: - file had no licensing information it it. - file was a /uapi/ one with no licensing information in it, - file was a /uapi/ one with existing licensing information, Further patches will be generated in subsequent months to fix up cases where non-standard license headers were used, and references to license had to be inferred by heuristics based on keywords. The analysis to determine which SPDX License Identifier to be applied to a file was done in a spreadsheet of side by side results from of the output of two independent scanners (ScanCode & Windriver) producing SPDX tag:value files created by Philippe Ombredanne. Philippe prepared the base worksheet, and did an initial spot review of a few 1000 files. The 4.13 kernel was the starting point of the analysis with 60,537 files assessed. Kate Stewart did a file by file comparison of the scanner results in the spreadsheet to determine which SPDX license identifier(s) to be applied to the file. She confirmed any determination that was not immediately clear with lawyers working with the Linux Foundation. Criteria used to select files for SPDX license identifier tagging was: - Files considered eligible had to be source code files. 
- Make and config files were included as candidates if they contained >5 lines of source - File already had some variant of a license header in it (even if <5 lines). All documentation files were explicitly excluded. The following heuristics were used to determine which SPDX license identifiers to apply. - when both scanners couldn't find any license traces, file was considered to have no license information in it, and the top level COPYING file license applied. For non /uapi/ files that summary was: SPDX license identifier # files ---------------------------------------------------\|------- GPL-2.0 11139 and resulted in the first patch in this series. If that file was a /uapi/ path one, it was "GPL-2.0 WITH Linux-syscall-note" otherwise it was "GPL-2.0". Results of that was: SPDX license identifier # files ---------------------------------------------------\|------- GPL-2.0 WITH Linux-syscall-note 930 and resulted in the second patch in this series. - if a file had some form of licensing information in it, and was one of the /uapi/ ones, it was denoted with the Linux-syscall-note if any GPL family license was found in the file or had no licensing in it (per prior point). Results summary: SPDX license identifier # files ---------------------------------------------------\|------ GPL-2.0 WITH Linux-syscall-note 270 GPL-2.0+ WITH Linux-syscall-note 169 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) 21 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) 17 LGPL-2.1+ WITH Linux-syscall-note 15 GPL-1.0+ WITH Linux-syscall-note 14 ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) 5 LGPL-2.0+ WITH Linux-syscall-note 4 LGPL-2.1 WITH Linux-syscall-note 3 ((GPL-2.0 WITH Linux-syscall-note) OR MIT) 3 ((GPL-2.0 WITH Linux-syscall-note) AND MIT) 1 and that resulted in the third patch in this series. - when the two scanners agreed on the detected license(s), that became the concluded license(s). 
- when there was disagreement between the two scanners (one detected a license but the other didn't, or they both detected different licenses) a manual inspection of the file occurred. - In most cases a manual inspection of the information in the file resulted in a clear resolution of the license that should apply (and which scanner probably needed to revisit its heuristics). - When it was not immediately clear, the license identifier was confirmed with lawyers working with the Linux Foundation. - If there was any question as to the appropriate license identifier, the file was flagged for further research and to be revisited later in time. In total, over 70 hours of logged manual review was done on the spreadsheet to determine the SPDX license identifiers to apply to the source files by Kate, Philippe, Thomas and, in some cases, confirmation by lawyers working with the Linux Foundation. Kate also obtained a third independent scan of the 4.13 code base from FOSSology, and compared selected files where the other two scanners disagreed against that SPDX file, to see if there was new insights. The Windriver scanner is based on an older version of FOSSology in part, so they are related. Thomas did random spot checks in about 500 files from the spreadsheets for the uapi headers and agreed with SPDX license identifier in the files he inspected. For the non-uapi files Thomas did random spot checks in about 15000 files. In initial set of patches against 4.14-rc6, 3 files were found to have copy/paste license identifier errors, and have been fixed to reflect the correct identifier. 
Additionally Philippe spent 10 hours this week doing a detailed manual inspection and review of the 12,461 patched files from the initial patch version early this week with: - a full scancode scan run, collecting the matched texts, detected license ids and scores - reviewing anything where there was a license detected (about 500+ files) to ensure that the applied SPDX license was correct - reviewing anything where there was no detection but the patch license was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied SPDX license was correct This produced a worksheet with 20 files needing minor correction. This worksheet was then exported into 3 different .csv files for the different types of files to be modified. These .csv files were then reviewed by Greg. Thomas wrote a script to parse the csv files and add the proper SPDX tag to the file, in the format that the file expected. This script was further refined by Greg based on the output to detect more types of files automatically and to distinguish between header and source .c files (which need different comment types.) Finally Greg ran the script using the .csv files to generate the patches. Reviewed-by: Kate Stewart <kstewart@linuxfoundation.org> Reviewed-by: Philippe Ombredanne <pombredanne@nexb.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2017-11-01 15:07:57 +01:00			`/* SPDX-License-Identifier: GPL-2.0 */`
bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com> 2013-03-24 00:11:31 +01:00			`#ifndef _BCACHE_JOURNAL_H`
			`#define _BCACHE_JOURNAL_H`

			`/*`
			`* THE JOURNAL:`
			`*`
			`* The journal is treated as a circular buffer of buckets - a journal entry`
			`* never spans two buckets. This means (not implemented yet) we can resize the`
			`* journal at runtime, and will be needed for bcache on raw flash support.`
			`*`
			`* Journal entries contain a list of keys, ordered by the time they were`
			`* inserted; thus journal replay just has to reinsert the keys.`
			`*`
			`* We also keep some things in the journal header that are logically part of the`
			`* superblock - all the things that are frequently updated. This is for future`
			`* bcache on raw flash support; the superblock (which will become another`
			`* journal) can't be moved or wear leveled, so it contains just enough`
			`* information to find the main journal, and the superblock only has to be`
			`* rewritten when we want to move/wear level the main journal.`
			`*`
			`* Currently, we don't journal BTREE_REPLACE operations - this will hopefully be`
			`* fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions`
			`* from cache misses, which don't have to be journaled, and for writeback and`
			`* moving gc we work around it by flushing the btree to disk before updating the`
			`* gc information. But it is a potential issue with incremental garbage`
			`* collection, and it's fragile.`
			`*`
			`* OPEN JOURNAL ENTRIES:`
			`*`
			`* Each journal entry contains, in the header, the sequence number of the last`
			`* journal entry still open - i.e. that has keys that haven't been flushed to`
			`* disk in the btree.`
			`*`
			`* We track this by maintaining a refcount for every open journal entry, in a`
			`* fifo; each entry in the fifo corresponds to a particular journal`
			`* entry/sequence number. When the refcount at the tail of the fifo goes to`
			`* zero, we pop it off - thus, the size of the fifo tells us the number of open`
			`* journal entries`
			`*`
			`* We take a refcount on a journal entry when we add some keys to a journal`
			`* entry that we're going to insert (held by struct btree_op), and then when we`
			`* insert those keys into the btree the btree write we're setting up takes a`
			`* copy of that refcount (held by struct btree_write). That refcount is dropped`
			`* when the btree write completes.`
			`*`
			`* A struct btree_write can only hold a refcount on a single journal entry, but`
			`* might contain keys for many journal entries - we handle this by making sure`
			`* it always has a refcount on the _oldest_ journal entry of all the journal`
			`* entries it has keys for.`
			`*`
			`* JOURNAL RECLAIM:`
			`*`
			`* As mentioned previously, our fifo of refcounts tells us the number of open`
			`* journal entries; from that and the current journal sequence number we compute`
			`* last_seq - the oldest journal entry we still need. We write last_seq in each`
			`* journal entry, and we also have to keep track of where it exists on disk so`
			`* we don't overwrite it when we loop around the journal.`
			`*`
			`* To do that we track, for each journal bucket, the sequence number of the`
			`* newest journal entry it contains - if we don't need that journal entry we`
			`* don't need anything in that bucket anymore. From that we track the last`
			`* journal bucket we still need; all this is tracked in struct journal_device`
			`* and updated by journal_reclaim().`
			`*`
			`* JOURNAL FILLING UP:`
			`*`
			`* There are two ways the journal could fill up; either we could run out of`
			`* space to write to, or we could have too many open journal entries and run out`
			`* of room in the fifo of refcounts. Since those refcounts are decremented`
			`* without any locking we can't safely resize that fifo, so we handle it the`
			`* same way.`
			`*`
			`* If the journal fills up, we start flushing dirty btree nodes until we can`
			`* allocate space for a journal write again - preferentially flushing btree`
			`* nodes that are pinning the oldest journal entries first.`
			`*/`

			`/*`
			`* Only used for holding the journal entries we read in btree_journal_read()`
			`* during cache_registration`
			`*/`
			`struct journal_replay {`
			`struct list_head list;`
			`atomic_t *pin;`
			`struct jset j;`
			`};`

			`/*`
			`* We put two of these in struct journal; we used them for writes to the`
			`* journal that are being staged or in flight.`
			`*/`
			`struct journal_write {`
			`struct jset *data;`
			`#define JSET_BITS 3`

			`struct cache_set *c;`
			`struct closure_waitlist wait;`
bcache: Fix a shutdown bug Shutdown wasn't cancelling/waiting on journal_write_work() Signed-off-by: Kent Overstreet <kmo@daterainc.com> 2014-02-20 04:48:26 +01:00			`bool dirty;`
bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com> 2013-03-24 00:11:31 +01:00			`bool need_write;`
			`};`

			`/* Embedded in struct cache_set */`
			`struct journal {`
			`spinlock_t lock;`
			`/* used when waiting because the journal was full */`
			`struct closure_waitlist wait;`
bcache: Fix a journalling performance bug 2013-10-09 00:50:46 +02:00			`struct closure io;`
bcache: kill closure locking usage Signed-off-by: Kent Overstreet <kmo@daterainc.com> 2013-12-17 00:27:25 +01:00			`int io_in_flight;`
bcache: Fix a journalling performance bug 2013-10-09 00:50:46 +02:00			`struct delayed_work work;`
bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com> 2013-03-24 00:11:31 +01:00
			`/* Number of blocks free in the bucket(s) we're currently writing to */`
			`unsigned blocks_free;`
			`uint64_t seq;`
			`DECLARE_FIFO(atomic_t, pin);`

			`BKEY_PADDED(key);`

			`struct journal_write w[2], *cur;`
			`};`

			`/*`
			`* Embedded in struct cache. First three fields refer to the array of journal`
			`* buckets, in cache_sb.`
			`*/`
			`struct journal_device {`
			`/*`
			`* For each journal bucket, contains the max sequence number of the`
			`* journal writes it contains - so we know when a bucket can be reused.`
			`*/`
			`uint64_t seq[SB_JOURNAL_BUCKETS];`

			`/* Journal bucket we're currently writing to */`
			`unsigned cur_idx;`

			`/* Last journal bucket that still contains an open journal entry */`
			`unsigned last_idx;`

			`/* Next journal bucket to be discarded */`
			`unsigned discard_idx;`

			`#define DISCARD_READY 0`
			`#define DISCARD_IN_FLIGHT 1`
			`#define DISCARD_DONE 2`
			`/* 1 - discard in flight, -1 - discard completed */`
			`atomic_t discard_in_flight;`

			`struct work_struct discard_work;`
			`struct bio discard_bio;`
			`struct bio_vec discard_bv;`

			`/* Bio for journal reads/writes to this device */`
			`struct bio bio;`
			`struct bio_vec bv[8];`
			`};`

			`#define journal_pin_cmp(c, l, r) \`
bcache: Prune struct btree_op Eventual goal is for struct btree_op to contain only what is necessary for traversing the btree. Signed-off-by: Kent Overstreet <kmo@daterainc.com> 2013-07-25 02:44:17 +02:00			`(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))`
bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com> 2013-03-24 00:11:31 +01:00
			`#define JOURNAL_PIN 20000`

			`#define journal_full(j) \`
			`(!(j)->blocks_free \|\| fifo_free(&(j)->pin) <= 1)`

			`struct closure;`
			`struct cache_set;`
			`struct btree_op;`
bcache: Refactor journalling flow control Making things less asynchronous that don't need to be - bch_journal() only has to block when the journal or journal entry is full, which is emphatically not a fast path. So make it a normal function that just returns when it finishes, to make the code and control flow easier to follow. Signed-off-by: Kent Overstreet <kmo@daterainc.com> 2013-10-25 02:07:04 +02:00			`struct keylist;`
bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com> 2013-03-24 00:11:31 +01:00
bcache: Refactor journalling flow control Making things less asynchronous that don't need to be - bch_journal() only has to block when the journal or journal entry is full, which is emphatically not a fast path. So make it a normal function that just returns when it finishes, to make the code and control flow easier to follow. Signed-off-by: Kent Overstreet <kmo@daterainc.com> 2013-10-25 02:07:04 +02:00			`atomic_t bch_journal(struct cache_set , struct keylist , struct closure );`
bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com> 2013-03-24 00:11:31 +01:00			`void bch_journal_next(struct journal *);`
			`void bch_journal_mark(struct cache_set , struct list_head );`
			`void bch_journal_meta(struct cache_set , struct closure );`
bcache: Prune struct btree_op Eventual goal is for struct btree_op to contain only what is necessary for traversing the btree. Signed-off-by: Kent Overstreet <kmo@daterainc.com> 2013-07-25 02:44:17 +02:00			`int bch_journal_read(struct cache_set , struct list_head );`
			`int bch_journal_replay(struct cache_set , struct list_head );`
bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com> 2013-03-24 00:11:31 +01:00
			`void bch_journal_free(struct cache_set *);`
			`int bch_journal_alloc(struct cache_set *);`

			`#endif /* _BCACHE_JOURNAL_H */`