diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index ed3e5e949fce..f35473f8c630 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -24,8 +24,6 @@ DMA-ISA-LPC.txt - How to do DMA with ISA (and LPC) devices. DMA-attributes.txt - listing of the various possible attributes a DMA region can have -DocBook/ - - directory with DocBook templates etc. for kernel documentation. EDID/ - directory with info on customizing EDID for broken gfx/displays. IPMI.txt @@ -40,8 +38,6 @@ Intel-IOMMU.txt - basic info on the Intel IOMMU virtualization support. Makefile - It's not of interest for those who aren't touching the build system. -Makefile.sphinx - - It's not of interest for those who aren't touching the build system. PCI/ - info related to PCI drivers. RCU/ @@ -264,6 +260,8 @@ logo.gif - full colour GIF image of Linux logo (penguin - Tux). logo.txt - info on creator of above logo & site to get additional images from. +lsm.txt + - Linux Security Modules: General Security Hooks for Linux lzo.txt - kernel LZO decompressor input formats m68k/ diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore deleted file mode 100644 index e05da3f7aa21..000000000000 --- a/Documentation/DocBook/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -*.xml -*.ps -*.pdf -*.html -*.9.gz -*.9 -*.aux -*.dvi -*.log -*.out -*.png -*.gif -*.svg -*.proc -*.db -media-indices.tmpl -media-entities.tmpl diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile deleted file mode 100644 index 85916f13d330..000000000000 --- a/Documentation/DocBook/Makefile +++ /dev/null @@ -1,282 +0,0 @@ -### -# This makefile is used to generate the kernel documentation, -# primarily based on in-line comments in various source files. -# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how -# to document the SRC - and how to read it. -# To add a new book the only step required is to add the book to the -# list of DOCBOOKS. - -DOCBOOKS := z8530book.xml \ - kernel-hacking.xml kernel-locking.xml \ - networking.xml \ - filesystems.xml lsm.xml kgdb.xml \ - libata.xml mtdnand.xml librs.xml rapidio.xml \ - s390-drivers.xml scsi.xml \ - sh.xml w1.xml - -ifeq ($(DOCBOOKS),) - -# Skip DocBook build if the user explicitly requested no DOCBOOKS. -.DEFAULT: - @echo " SKIP DocBook $@ target (DOCBOOKS=\"\" specified)." -else -ifneq ($(SPHINXDIRS),) - -# Skip DocBook build if the user explicitly requested a sphinx dir -.DEFAULT: - @echo " SKIP DocBook $@ target (SPHINXDIRS specified)." -else - - -### -# The build process is as follows (targets): -# (xmldocs) [by docproc] -# file.tmpl --> file.xml +--> file.ps (psdocs) [by db2ps or xmlto] -# +--> file.pdf (pdfdocs) [by db2pdf or xmlto] -# +--> DIR=file (htmldocs) [by xmlto] -# +--> man/ (mandocs) [by xmlto] - - -# for PDF and PS output you can choose between xmlto and docbook-utils tools -PDF_METHOD = $(prefer-db2x) -PS_METHOD = $(prefer-db2x) - - -targets += $(DOCBOOKS) -BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) -xmldocs: $(BOOKS) -sgmldocs: xmldocs - -PS := $(patsubst %.xml, %.ps, $(BOOKS)) -psdocs: $(PS) - -PDF := $(patsubst %.xml, %.pdf, $(BOOKS)) -pdfdocs: $(PDF) - -HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS))) -htmldocs: $(HTML) - $(call cmd,build_main_index) - -MAN := $(patsubst %.xml, %.9, $(BOOKS)) -mandocs: $(MAN) - find $(obj)/man -name '*.9' | xargs gzip -nf - -# Default location for installed man pages -export INSTALL_MAN_PATH = $(objtree)/usr - -installmandocs: mandocs - mkdir -p $(INSTALL_MAN_PATH)/man/man9/ - find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \ - sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \ - xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/ - -# no-op for the DocBook toolchain -epubdocs: -latexdocs: -linkcheckdocs: - -### -#External programs used -KERNELDOCXMLREF = $(srctree)/scripts/kernel-doc-xml-ref -KERNELDOC = $(srctree)/scripts/kernel-doc -DOCPROC = $(objtree)/scripts/docproc -CHECK_LC_CTYPE = $(objtree)/scripts/check-lc_ctype - -# Use a fixed encoding - UTF-8 if the C library has support built-in -# or ASCII if not -LC_CTYPE := $(call try-run, LC_CTYPE=C.UTF-8 $(CHECK_LC_CTYPE),C.UTF-8,C) -export LC_CTYPE - -XMLTOFLAGS = -m $(srctree)/$(src)/stylesheet.xsl -XMLTOFLAGS += --skip-validation - -### -# DOCPROC is used for two purposes: -# 1) To generate a dependency list for a .tmpl file -# 2) To preprocess a .tmpl file and call kernel-doc with -# appropriate parameters. -# The following rules are used to generate the .xml documentation -# required to generate the final targets. (ps, pdf, html). -quiet_cmd_docproc = DOCPROC $@ - cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@ -define rule_docproc - set -e; \ - $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \ - $(cmd_$(1)); \ - ( \ - echo 'cmd_$@ := $(cmd_$(1))'; \ - echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \ - ) > $(dir $@).$(notdir $@).cmd -endef - -%.xml: %.tmpl $(KERNELDOC) $(DOCPROC) $(KERNELDOCXMLREF) FORCE - $(call if_changed_rule,docproc) - -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \ - exit 1 -db2xtemplate = db2TYPE -o $(dir $@) $< -xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $< - -# determine which methods are available -ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found) - use-db2x = db2x - prefer-db2x = db2x -else - use-db2x = notfound - prefer-db2x = $(use-xmlto) -endif -ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found) - use-xmlto = xmlto - prefer-xmlto = xmlto -else - use-xmlto = notfound - prefer-xmlto = $(use-db2x) -endif - -# the commands, generated from the chosen template -quiet_cmd_db2ps = PS $@ - cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template)) -%.ps : %.xml - $(call cmd,db2ps) - -quiet_cmd_db2pdf = PDF $@ - cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template)) -%.pdf : %.xml - $(call cmd,db2pdf) - - -index = index.html -main_idx = $(obj)/$(index) -quiet_cmd_build_main_index = HTML $(main_idx) - cmd_build_main_index = rm -rf $(main_idx); \ - echo '

Linux Kernel HTML Documentation

' >> $(main_idx) && \ - echo '

Kernel Version: $(KERNELVERSION)

' >> $(main_idx) && \ - cat $(HTML) >> $(main_idx) - -quiet_cmd_db2html = HTML $@ - cmd_db2html = xmlto html $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \ - echo ' \ - $(patsubst %.html,%,$(notdir $@))

' > $@ - -### -# Rules to create an aux XML and .db, and use them to re-process the DocBook XML -# to fill internal hyperlinks - gen_aux_xml = : - quiet_gen_aux_xml = echo ' XMLREF $@' -silent_gen_aux_xml = : -%.aux.xml: %.xml - @$($(quiet)gen_aux_xml) - @rm -rf $@ - @(cat $< | egrep "^ $<.db) - @$(KERNELDOCXMLREF) -db $<.db $< > $@ -.PRECIOUS: %.aux.xml - -%.html: %.aux.xml - @(which xmlto > /dev/null 2>&1) || \ - (echo "*** You need to install xmlto ***"; \ - exit 1) - @rm -rf $@ $(patsubst %.html,%,$@) - $(call cmd,db2html) - @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \ - cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi - -quiet_cmd_db2man = MAN $@ - cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man/$(*F) $< ; fi -%.9 : %.xml - @(which xmlto > /dev/null 2>&1) || \ - (echo "*** You need to install xmlto ***"; \ - exit 1) - $(Q)mkdir -p $(obj)/man/$(*F) - $(call cmd,db2man) - @touch $@ - -### -# Rules to generate postscripts and PNG images from .fig format files -quiet_cmd_fig2eps = FIG2EPS $@ - cmd_fig2eps = fig2dev -Leps $< $@ - -%.eps: %.fig - @(which fig2dev > /dev/null 2>&1) || \ - (echo "*** You need to install transfig ***"; \ - exit 1) - $(call cmd,fig2eps) - -quiet_cmd_fig2png = FIG2PNG $@ - cmd_fig2png = fig2dev -Lpng $< $@ - -%.png: %.fig - @(which fig2dev > /dev/null 2>&1) || \ - (echo "*** You need to install transfig ***"; \ - exit 1) - $(call cmd,fig2png) - -### -# Rule to convert a .c file to inline XML documentation - gen_xml = : - quiet_gen_xml = echo ' GEN $@' -silent_gen_xml = : -%.xml: %.c - @$($(quiet)gen_xml) - @( \ - echo ""; \ - expand --tabs=8 < $< | \ - sed -e "s/&/\\&/g" \ - -e "s//\\>/g"; \ - echo "") > $@ - -endif # DOCBOOKS="" -endif # SPHINDIR=... - -### -# Help targets as used by the top-level makefile -dochelp: - @echo ' Linux kernel internal documentation in different formats (DocBook):' - @echo ' htmldocs - HTML' - @echo ' pdfdocs - PDF' - @echo ' psdocs - Postscript' - @echo ' xmldocs - XML DocBook' - @echo ' mandocs - man pages' - @echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'; \ - echo ' (default: $(INSTALL_MAN_PATH))'; \ - echo '' - @echo ' cleandocs - clean all generated DocBook files' - @echo - @echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml' - @echo ' valid values for DOCBOOKS are: $(DOCBOOKS)' - @echo - @echo " make DOCBOOKS=\"\" [target] Don't generate docs from Docbook" - @echo ' This is useful to generate only the ReST docs (Sphinx)' - - -### -# Temporary files left by various tools -clean-files := $(DOCBOOKS) \ - $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \ - $(patsubst %.xml, %.aux, $(DOCBOOKS)) \ - $(patsubst %.xml, %.tex, $(DOCBOOKS)) \ - $(patsubst %.xml, %.log, $(DOCBOOKS)) \ - $(patsubst %.xml, %.out, $(DOCBOOKS)) \ - $(patsubst %.xml, %.ps, $(DOCBOOKS)) \ - $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \ - $(patsubst %.xml, %.html, $(DOCBOOKS)) \ - $(patsubst %.xml, %.9, $(DOCBOOKS)) \ - $(patsubst %.xml, %.aux.xml, $(DOCBOOKS)) \ - $(patsubst %.xml, %.xml.db, $(DOCBOOKS)) \ - $(patsubst %.xml, %.xml, $(DOCBOOKS)) \ - $(patsubst %.xml, .%.xml.cmd, $(DOCBOOKS)) \ - $(index) - -clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man - -cleandocs: - $(Q)rm -f $(call objectify, $(clean-files)) - $(Q)rm -rf $(call objectify, $(clean-dirs)) - -# Declare the contents of the .PHONY variable as phony. We keep that -# information in a variable so we can use it in if_changed and friends. - -.PHONY: $(PHONY) diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl deleted file mode 100644 index 6006b6358c86..000000000000 --- a/Documentation/DocBook/filesystems.tmpl +++ /dev/null @@ -1,381 +0,0 @@ - - - - - - Linux Filesystems API - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - - - - - - - The Linux VFS - The Filesystem types -!Iinclude/linux/fs.h - - The Directory Cache -!Efs/dcache.c -!Iinclude/linux/dcache.h - - Inode Handling -!Efs/inode.c -!Efs/bad_inode.c - - Registration and Superblocks -!Efs/super.c - - File Locks -!Efs/locks.c -!Ifs/locks.c - - Other Functions -!Efs/mpage.c -!Efs/namei.c -!Efs/buffer.c -!Eblock/bio.c -!Efs/seq_file.c -!Efs/filesystems.c -!Efs/fs-writeback.c -!Efs/block_dev.c - - - - - The proc filesystem - - sysctl interface -!Ekernel/sysctl.c - - - proc filesystem interface -!Ifs/proc/base.c - - - - - Events based on file descriptors -!Efs/eventfd.c - - - - The Filesystem for Exporting Kernel Objects -!Efs/sysfs/file.c -!Efs/sysfs/symlink.c - - - - The debugfs filesystem - - debugfs interface -!Efs/debugfs/inode.c -!Efs/debugfs/file.c - - - - - - The Linux Journalling API - - - - Roger - Gammans - -

- rgammans@computer-surgery.co.uk -
- - - - - - - Stephen - Tweedie - -
- sct@redhat.com -
-
-
-
- - - 2002 - Roger Gammans - - - - The Linux Journalling API - - - Overview - - Details - -The journalling layer is easy to use. You need to -first of all create a journal_t data structure. There are -two calls to do this dependent on how you decide to allocate the physical -media on which the journal resides. The jbd2_journal_init_inode() call -is for journals stored in filesystem inodes, or the jbd2_journal_init_dev() -call can be used for journal stored on a raw device (in a continuous range -of blocks). A journal_t is a typedef for a struct pointer, so when -you are finally finished make sure you call jbd2_journal_destroy() on it -to free up any used kernel memory. - - - -Once you have got your journal_t object you need to 'mount' or load the journal -file. The journalling layer expects the space for the journal was already -allocated and initialized properly by the userspace tools. When loading the -journal you must call jbd2_journal_load() to process journal contents. If the -client file system detects the journal contents does not need to be processed -(or even need not have valid contents), it may call jbd2_journal_wipe() to -clear the journal contents before calling jbd2_journal_load(). - - - -Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if -it detects any outstanding transactions in the journal and similarly -jbd2_journal_load() will call jbd2_journal_recover() if necessary. I would -advise reading ext4_load_journal() in fs/ext4/super.c for examples on this -stage. - - - -Now you can go ahead and start modifying the underlying -filesystem. Almost. - - - - -You still need to actually journal your filesystem changes, this -is done by wrapping them into transactions. Additionally you -also need to wrap the modification of each of the buffers -with calls to the journal layer, so it knows what the modifications -you are actually making are. To do this use jbd2_journal_start() which -returns a transaction handle. - - - -jbd2_journal_start() -and its counterpart jbd2_journal_stop(), which indicates the end of a -transaction are nestable calls, so you can reenter a transaction if necessary, -but remember you must call jbd2_journal_stop() the same number of times as -jbd2_journal_start() before the transaction is completed (or more accurately -leaves the update phase). Ext4/VFS makes use of this feature to simplify -handling of inode dirtying, quota support, etc. - - - -Inside each transaction you need to wrap the modifications to the -individual buffers (blocks). Before you start to modify a buffer you -need to call jbd2_journal_get_{create,write,undo}_access() as appropriate, -this allows the journalling layer to copy the unmodified data if it -needs to. After all the buffer may be part of a previously uncommitted -transaction. -At this point you are at last ready to modify a buffer, and once -you are have done so you need to call jbd2_journal_dirty_{meta,}data(). -Or if you've asked for access to a buffer you now know is now longer -required to be pushed back on the device you can call jbd2_journal_forget() -in much the same way as you might have used bforget() in the past. - - - -A jbd2_journal_flush() may be called at any time to commit and checkpoint -all your transactions. - - - -Then at umount time , in your put_super() you can then call jbd2_journal_destroy() -to clean up your in-core journal object. - - - -Unfortunately there a couple of ways the journal layer can cause a deadlock. -The first thing to note is that each task can only have -a single outstanding transaction at any one time, remember nothing -commits until the outermost jbd2_journal_stop(). This means -you must complete the transaction at the end of each file/inode/address -etc. operation you perform, so that the journalling system isn't re-entered -on another journal. Since transactions can't be nested/batched -across differing journals, and another filesystem other than -yours (say ext4) may be modified in a later syscall. - - - -The second case to bear in mind is that jbd2_journal_start() can -block if there isn't enough space in the journal for your transaction -(based on the passed nblocks param) - when it blocks it merely(!) needs to -wait for transactions to complete and be committed from other tasks, -so essentially we are waiting for jbd2_journal_stop(). So to avoid -deadlocks you must treat jbd2_journal_start/stop() as if they -were semaphores and include them in your semaphore ordering rules to prevent -deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to -jbd2_journal_start() so you can deadlock here just as easily as on -jbd2_journal_start(). - - - -Try to reserve the right number of blocks the first time. ;-). This will -be the maximum number of blocks you are going to touch in this transaction. -I advise having a look at at least ext4_jbd.h to see the basis on which -ext4 uses to make these decisions. - - - -Another wriggle to watch out for is your on-disk block allocation strategy. -Why? Because, if you do a delete, you need to ensure you haven't reused any -of the freed blocks until the transaction freeing these blocks commits. If you -reused these blocks and crash happens, there is no way to restore the contents -of the reallocated blocks at the end of the last fully committed transaction. - -One simple way of doing this is to mark blocks as free in internal in-memory -block allocation structures only after the transaction freeing them commits. -Ext4 uses journal commit callback for this purpose. - - - -With journal commit callbacks you can ask the journalling layer to call a -callback function when the transaction is finally committed to disk, so that -you can do some of your own management. You ask the journalling layer for -calling the callback by simply setting journal->j_commit_callback function -pointer and that function is called after each transaction commit. You can also -use transaction->t_private_list for attaching entries to a transaction that -need processing when the transaction commits. - - - -JBD2 also provides a way to block all transaction updates via -jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a -clean and stable fs for a moment. E.g. - - - - - jbd2_journal_lock_updates() //stop new stuff happening.. - jbd2_journal_flush() // checkpoint everything. - ..do stuff on stable fs - jbd2_journal_unlock_updates() // carry on with filesystem use. - - - -The opportunities for abuse and DOS attacks with this should be obvious, -if you allow unprivileged userspace to trigger codepaths containing these -calls. - - - - - - Summary - -Using the journal is a matter of wrapping the different context changes, -being each mount, each modification (transaction) and each changed buffer -to tell the journalling layer about them. - - - - - - - - Data Types - - The journalling layer uses typedefs to 'hide' the concrete definitions - of the structures used. As a client of the JBD2 layer you can - just rely on the using the pointer as a magic cookie of some sort. - - Obviously the hiding is not enforced as this is 'C'. - - Structures -!Iinclude/linux/jbd2.h - - - - - Functions - - The functions here are split into two groups those that - affect a journal as a whole, and those which are used to - manage transactions - - Journal Level -!Efs/jbd2/journal.c -!Ifs/jbd2/recovery.c - - Transasction Level -!Efs/jbd2/transaction.c - - - - See also - - - - Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie - - - - - - - Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie - - - - - - - - - splice API - - splice is a method for moving blocks of data around inside the - kernel, without continually transferring them between the kernel - and user space. - -!Ffs/splice.c - - - - pipes API - - Pipe interfaces are all for in-kernel (builtin image) use. - They are not exported for use by modules. - -!Iinclude/linux/pipe_fs_i.h -!Ffs/pipe.c - - - diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl deleted file mode 100644 index da5c087462b1..000000000000 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ /dev/null @@ -1,1312 +0,0 @@ - - - - - - Unreliable Guide To Hacking The Linux Kernel - - - - Rusty - Russell - -
- rusty@rustcorp.com.au -
-
-
-
- - - 2005 - Rusty Russell - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - - - - This is the first release of this document as part of the kernel tarball. - - -
- - - - - Introduction - - Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux - Kernel Hacking. This document describes the common routines and - general requirements for kernel code: its goal is to serve as a - primer for Linux kernel development for experienced C - programmers. I avoid implementation details: that's what the - code is for, and I ignore whole tracts of useful routines. - - - Before you read this, please understand that I never wanted to - write this document, being grossly under-qualified, but I always - wanted to read it, and this was the only way. I hope it will - grow into a compendium of best practice, common starting points - and random information. - - - - - The Players - - - At any time each of the CPUs in a system can be: - - - - - - not associated with any process, serving a hardware interrupt; - - - - - - not associated with any process, serving a softirq or tasklet; - - - - - - running in kernel space, associated with a process (user context); - - - - - - running a process in user space. - - - - - - There is an ordering between these. The bottom two can preempt - each other, but above that is a strict hierarchy: each can only be - preempted by the ones above it. For example, while a softirq is - running on a CPU, no other softirq will preempt it, but a hardware - interrupt can. However, any other CPUs in the system execute - independently. - - - - We'll see a number of ways that the user context can block - interrupts, to become truly non-preemptable. - - - - User Context - - - User context is when you are coming in from a system call or other - trap: like userspace, you can be preempted by more important tasks - and by interrupts. You can sleep, by calling - schedule(). - - - - - You are always in user context on module load and unload, - and on operations on the block device layer. - - - - - In user context, the current pointer (indicating - the task we are currently executing) is valid, and - in_interrupt() - (include/linux/interrupt.h) is false - . - - - - - Beware that if you have preemption or softirqs disabled - (see below), in_interrupt() will return a - false positive. - - - - - - Hardware Interrupts (Hard IRQs) - - - Timer ticks, network cards and - keyboard are examples of real - hardware which produce interrupts at any time. The kernel runs - interrupt handlers, which services the hardware. The kernel - guarantees that this handler is never re-entered: if the same - interrupt arrives, it is queued (or dropped). Because it - disables interrupts, this handler has to be fast: frequently it - simply acknowledges the interrupt, marks a 'software interrupt' - for execution and exits. - - - - You can tell you are in a hardware interrupt, because - in_irq() returns true. - - - - Beware that this will return a false positive if interrupts are disabled - (see below). - - - - - - Software Interrupt Context: Softirqs and Tasklets - - - Whenever a system call is about to return to userspace, or a - hardware interrupt handler exits, any 'software interrupts' - which are marked pending (usually by hardware interrupts) are - run (kernel/softirq.c). - - - - Much of the real interrupt handling work is done here. Early in - the transition to SMP, there were only 'bottom - halves' (BHs), which didn't take advantage of multiple CPUs. Shortly - after we switched from wind-up computers made of match-sticks and snot, - we abandoned this limitation and switched to 'softirqs'. - - - - include/linux/interrupt.h lists the - different softirqs. A very important softirq is the - timer softirq (include/linux/timer.h): you can - register to have it call functions for you in a given length of - time. - - - - Softirqs are often a pain to deal with, since the same softirq - will run simultaneously on more than one CPU. For this reason, - tasklets (include/linux/interrupt.h) are more - often used: they are dynamically-registrable (meaning you can have - as many as you want), and they also guarantee that any tasklet - will only run on one CPU at any time, although different tasklets - can run simultaneously. - - - - The name 'tasklet' is misleading: they have nothing to do with 'tasks', - and probably more to do with some bad vodka Alexey Kuznetsov had at the - time. - - - - - You can tell you are in a softirq (or tasklet) - using the in_softirq() macro - (include/linux/interrupt.h). - - - - Beware that this will return a false positive if a bh lock (see below) - is held. - - - - - - - Some Basic Rules - - - - No memory protection - - - If you corrupt memory, whether in user context or - interrupt context, the whole machine will crash. Are you - sure you can't do what you want in userspace? - - - - - - No floating point or MMX - - - The FPU context is not saved; even in user - context the FPU state probably won't - correspond with the current process: you would mess with some - user process' FPU state. If you really want - to do this, you would have to explicitly save/restore the full - FPU state (and avoid context switches). It - is generally a bad idea; use fixed point arithmetic first. - - - - - - A rigid stack limit - - - Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's - about 14K on most 64-bit archs, and often shared with interrupts - so you can't use it all. Avoid deep recursion and huge local - arrays on the stack (allocate them dynamically instead). - - - - - - The Linux kernel is portable - - - Let's keep it that way. Your code should be 64-bit clean, - and endian-independent. You should also minimize CPU - specific stuff, e.g. inline assembly should be cleanly - encapsulated and minimized to ease porting. Generally it - should be restricted to the architecture-dependent part of - the kernel tree. - - - - - - - - ioctls: Not writing a new system call - - - A system call generally looks like this - - - -asmlinkage long sys_mycall(int arg) -{ - return 0; -} - - - - First, in most cases you don't want to create a new system call. - You create a character device and implement an appropriate ioctl - for it. This is much more flexible than system calls, doesn't have - to be entered in every architecture's - include/asm/unistd.h and - arch/kernel/entry.S file, and is much more - likely to be accepted by Linus. - - - - If all your routine does is read or write some parameter, consider - implementing a sysfs interface instead. - - - - Inside the ioctl you're in user context to a process. When a - error occurs you return a negated errno (see - include/linux/errno.h), - otherwise you return 0. - - - - After you slept you should check if a signal occurred: the - Unix/Linux way of handling signals is to temporarily exit the - system call with the -ERESTARTSYS error. The - system call entry code will switch back to user context, process - the signal handler and then your system call will be restarted - (unless the user disabled that). So you should be prepared to - process the restart, e.g. if you're in the middle of manipulating - some data structure. - - - -if (signal_pending(current)) - return -ERESTARTSYS; - - - - If you're doing longer computations: first think userspace. If you - really want to do it in kernel you should - regularly check if you need to give up the CPU (remember there is - cooperative multitasking per CPU). Idiom: - - - -cond_resched(); /* Will sleep */ - - - - A short note on interface design: the UNIX system call motto is - "Provide mechanism not policy". - - - - - Recipes for Deadlock - - - You cannot call any routines which may sleep, unless: - - - - - You are in user context. - - - - - - You do not own any spinlocks. - - - - - - You have interrupts enabled (actually, Andi Kleen says - that the scheduling code will enable them for you, but - that's probably not what you wanted). - - - - - - Note that some functions may sleep implicitly: common ones are - the user space access functions (*_user) and memory allocation - functions without GFP_ATOMIC. - - - - You should always compile your kernel - CONFIG_DEBUG_ATOMIC_SLEEP on, and it will warn - you if you break these rules. If you do break - the rules, you will eventually lock up your box. - - - - Really. - - - - - Common Routines - - - - <function>printk()</function> - <filename class="headerfile">include/linux/kernel.h</filename> - - - - printk() feeds kernel messages to the - console, dmesg, and the syslog daemon. It is useful for debugging - and reporting errors, and can be used inside interrupt context, - but use with caution: a machine which has its console flooded with - printk messages is unusable. It uses a format string mostly - compatible with ANSI C printf, and C string concatenation to give - it a first "priority" argument: - - - -printk(KERN_INFO "i = %u\n", i); - - - - See include/linux/kernel.h; - for other KERN_ values; these are interpreted by syslog as the - level. Special case: for printing an IP address use - - - -__be32 ipaddress; -printk(KERN_INFO "my ip: %pI4\n", &ipaddress); - - - - printk() internally uses a 1K buffer and does - not catch overruns. Make sure that will be enough. - - - - - You will know when you are a real kernel hacker - when you start typoing printf as printk in your user programs :) - - - - - - - - Another sidenote: the original Unix Version 6 sources had a - comment on top of its printf function: "Printf should not be - used for chit-chat". You should follow that advice. - - - - - - - <function>copy_[to/from]_user()</function> - / - <function>get_user()</function> - / - <function>put_user()</function> - <filename class="headerfile">include/linux/uaccess.h</filename> - - - - [SLEEPS] - - - - put_user() and get_user() - are used to get and put single values (such as an int, char, or - long) from and to userspace. A pointer into userspace should - never be simply dereferenced: data should be copied using these - routines. Both return -EFAULT or 0. - - - copy_to_user() and - copy_from_user() are more general: they copy - an arbitrary amount of data to and from userspace. - - - Unlike put_user() and - get_user(), they return the amount of - uncopied data (ie. 0 still means - success). - - - [Yes, this moronic interface makes me cringe. The flamewar comes up every year or so. --RR.] - - - The functions may sleep implicitly. This should never be called - outside user context (it makes no sense), with interrupts - disabled, or a spinlock held. - - - - - <function>kmalloc()</function>/<function>kfree()</function> - <filename class="headerfile">include/linux/slab.h</filename> - - - [MAY SLEEP: SEE BELOW] - - - - These routines are used to dynamically request pointer-aligned - chunks of memory, like malloc and free do in userspace, but - kmalloc() takes an extra flag word. - Important values: - - - - - - - GFP_KERNEL - - - - - May sleep and swap to free memory. Only allowed in user - context, but is the most reliable way to allocate memory. - - - - - - - - GFP_ATOMIC - - - - - Don't sleep. Less reliable than GFP_KERNEL, - but may be called from interrupt context. You should - really have a good out-of-memory - error-handling strategy. - - - - - - - - GFP_DMA - - - - - Allocate ISA DMA lower than 16MB. If you don't know what that - is you don't need it. Very unreliable. - - - - - - - If you see a sleeping function called from invalid - context warning message, then maybe you called a - sleeping allocation function from interrupt context without - GFP_ATOMIC. You should really fix that. - Run, don't walk. - - - - If you are allocating at least PAGE_SIZE - (include/asm/page.h) bytes, - consider using __get_free_pages() - - (include/linux/mm.h). It - takes an order argument (0 for page sized, 1 for double page, 2 - for four pages etc.) and the same memory priority flag word as - above. - - - - If you are allocating more than a page worth of bytes you can use - vmalloc(). It'll allocate virtual memory in - the kernel map. This block is not contiguous in physical memory, - but the MMU makes it look like it is for you - (so it'll only look contiguous to the CPUs, not to external device - drivers). If you really need large physically contiguous memory - for some weird device, you have a problem: it is poorly supported - in Linux because after some time memory fragmentation in a running - kernel makes it hard. The best way is to allocate the block early - in the boot process via the alloc_bootmem() - routine. - - - - Before inventing your own cache of often-used objects consider - using a slab cache in - include/linux/slab.h - - - - - <function>current</function> - <filename class="headerfile">include/asm/current.h</filename> - - - This global variable (really a macro) contains a pointer to - the current task structure, so is only valid in user context. - For example, when a process makes a system call, this will - point to the task structure of the calling process. It is - not NULL in interrupt context. - - - - - <function>mdelay()</function>/<function>udelay()</function> - <filename class="headerfile">include/asm/delay.h</filename> - <filename class="headerfile">include/linux/delay.h</filename> - - - - The udelay() and ndelay() functions can be used for small pauses. - Do not use large values with them as you risk - overflow - the helper function mdelay() is useful - here, or consider msleep(). - - - - - <function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function> - <filename class="headerfile">include/asm/byteorder.h</filename> - - - - The cpu_to_be32() family (where the "32" can - be replaced by 64 or 16, and the "be" can be replaced by "le") are - the general way to do endian conversions in the kernel: they - return the converted value. All variations supply the reverse as - well: be32_to_cpu(), etc. - - - - There are two major variations of these functions: the pointer - variation, such as cpu_to_be32p(), which take - a pointer to the given type, and return the converted value. The - other variation is the "in-situ" family, such as - cpu_to_be32s(), which convert value referred - to by the pointer, and return void. - - - - - <function>local_irq_save()</function>/<function>local_irq_restore()</function> - <filename class="headerfile">include/linux/irqflags.h</filename> - - - - These routines disable hard interrupts on the local CPU, and - restore them. They are reentrant; saving the previous state in - their one unsigned long flags argument. If you - know that interrupts are enabled, you can simply use - local_irq_disable() and - local_irq_enable(). - - - - - <function>local_bh_disable()</function>/<function>local_bh_enable()</function> - <filename class="headerfile">include/linux/interrupt.h</filename> - - - These routines disable soft interrupts on the local CPU, and - restore them. They are reentrant; if soft interrupts were - disabled before, they will still be disabled after this pair - of functions has been called. They prevent softirqs and tasklets - from running on the current CPU. - - - - - <function>smp_processor_id</function>() - <filename class="headerfile">include/asm/smp.h</filename> - - - get_cpu() disables preemption (so you won't - suddenly get moved to another CPU) and returns the current - processor number, between 0 and NR_CPUS. Note - that the CPU numbers are not necessarily continuous. You return - it again with put_cpu() when you are done. - - - If you know you cannot be preempted by another task (ie. you are - in interrupt context, or have preemption disabled) you can use - smp_processor_id(). - - - - - <type>__init</type>/<type>__exit</type>/<type>__initdata</type> - <filename class="headerfile">include/linux/init.h</filename> - - - After boot, the kernel frees up a special section; functions - marked with __init and data structures marked with - __initdata are dropped after boot is complete: similarly - modules discard this memory after initialization. __exit - is used to declare a function which is only required on exit: the - function will be dropped if this file is not compiled as a module. - See the header file for use. Note that it makes no sense for a function - marked with __init to be exported to modules with - EXPORT_SYMBOL() - this will break. - - - - - - <function>__initcall()</function>/<function>module_init()</function> - <filename class="headerfile">include/linux/init.h</filename> - - Many parts of the kernel are well served as a module - (dynamically-loadable parts of the kernel). Using the - module_init() and - module_exit() macros it is easy to write code - without #ifdefs which can operate both as a module or built into - the kernel. - - - - The module_init() macro defines which - function is to be called at module insertion time (if the file is - compiled as a module), or at boot time: if the file is not - compiled as a module the module_init() macro - becomes equivalent to __initcall(), which - through linker magic ensures that the function is called on boot. - - - - The function can return a negative error number to cause - module loading to fail (unfortunately, this has no effect if - the module is compiled into the kernel). This function is - called in user context with interrupts enabled, so it can sleep. - - - - - <function>module_exit()</function> - <filename class="headerfile">include/linux/init.h</filename> - - - This macro defines the function to be called at module removal - time (or never, in the case of the file compiled into the - kernel). It will only be called if the module usage count has - reached zero. This function can also sleep, but cannot fail: - everything must be cleaned up by the time it returns. - - - - Note that this macro is optional: if it is not present, your - module will not be removable (except for 'rmmod -f'). - - - - - <function>try_module_get()</function>/<function>module_put()</function> - <filename class="headerfile">include/linux/module.h</filename> - - - These manipulate the module usage count, to protect against - removal (a module also can't be removed if another module uses one - of its exported symbols: see below). Before calling into module - code, you should call try_module_get() on - that module: if it fails, then the module is being removed and you - should act as if it wasn't there. Otherwise, you can safely enter - the module, and call module_put() when you're - finished. - - - - Most registerable structures have an - owner field, such as in the - file_operations structure. Set this field - to the macro THIS_MODULE. - - - - - - - - Wait Queues - <filename class="headerfile">include/linux/wait.h</filename> - - - [SLEEPS] - - - - A wait queue is used to wait for someone to wake you up when a - certain condition is true. They must be used carefully to ensure - there is no race condition. You declare a - wait_queue_head_t, and then processes which want to - wait for that condition declare a wait_queue_t - referring to themselves, and place that in the queue. - - - - Declaring - - - You declare a wait_queue_head_t using the - DECLARE_WAIT_QUEUE_HEAD() macro, or using the - init_waitqueue_head() routine in your - initialization code. - - - - - Queuing - - - Placing yourself in the waitqueue is fairly complex, because you - must put yourself in the queue before checking the condition. - There is a macro to do this: - wait_event_interruptible() - - include/linux/wait.h The - first argument is the wait queue head, and the second is an - expression which is evaluated; the macro returns - 0 when this expression is true, or - -ERESTARTSYS if a signal is received. - The wait_event() version ignores signals. - - - - - - Waking Up Queued Tasks - - - Call wake_up() - - include/linux/wait.h;, - which will wake up every process in the queue. The exception is - if one has TASK_EXCLUSIVE set, in which case - the remainder of the queue will not be woken. There are other variants - of this basic function available in the same header. - - - - - - Atomic Operations - - - Certain operations are guaranteed atomic on all platforms. The - first class of operations work on atomic_t - - include/asm/atomic.h; this - contains a signed integer (at least 32 bits long), and you must use - these functions to manipulate or read atomic_t variables. - atomic_read() and - atomic_set() get and set the counter, - atomic_add(), - atomic_sub(), - atomic_inc(), - atomic_dec(), and - atomic_dec_and_test() (returns - true if it was decremented to zero). - - - - Yes. It returns true (i.e. != 0) if the - atomic variable is zero. - - - - Note that these functions are slower than normal arithmetic, and - so should not be used unnecessarily. - - - - The second class of atomic operations is atomic bit operations on an - unsigned long, defined in - - include/linux/bitops.h. These - operations generally take a pointer to the bit pattern, and a bit - number: 0 is the least significant bit. - set_bit(), clear_bit() - and change_bit() set, clear, and flip the - given bit. test_and_set_bit(), - test_and_clear_bit() and - test_and_change_bit() do the same thing, - except return true if the bit was previously set; these are - particularly useful for atomically setting flags. - - - - It is possible to call these operations with bit indices greater - than BITS_PER_LONG. The resulting behavior is strange on big-endian - platforms though so it is a good idea not to do this. - - - - - Symbols - - - Within the kernel proper, the normal linking rules apply - (ie. unless a symbol is declared to be file scope with the - static keyword, it can be used anywhere in the - kernel). However, for modules, a special exported symbol table is - kept which limits the entry points to the kernel proper. Modules - can also export symbols. - - - - <function>EXPORT_SYMBOL()</function> - <filename class="headerfile">include/linux/export.h</filename> - - - This is the classic method of exporting a symbol: dynamically - loaded modules will be able to use the symbol as normal. - - - - - <function>EXPORT_SYMBOL_GPL()</function> - <filename class="headerfile">include/linux/export.h</filename> - - - Similar to EXPORT_SYMBOL() except that the - symbols exported by EXPORT_SYMBOL_GPL() can - only be seen by modules with a - MODULE_LICENSE() that specifies a GPL - compatible license. It implies that the function is considered - an internal implementation issue, and not really an interface. - Some maintainers and developers may however - require EXPORT_SYMBOL_GPL() when adding any new APIs or functionality. - - - - - - Routines and Conventions - - - Double-linked lists - <filename class="headerfile">include/linux/list.h</filename> - - - There used to be three sets of linked-list routines in the kernel - headers, but this one is the winner. If you don't have some - particular pressing need for a single list, it's a good choice. - - - - In particular, list_for_each_entry is useful. - - - - - Return Conventions - - - For code called in user context, it's very common to defy C - convention, and return 0 for success, - and a negative error number - (eg. -EFAULT) for failure. This can be - unintuitive at first, but it's fairly widespread in the kernel. - - - - Using ERR_PTR() - - include/linux/err.h; to - encode a negative error number into a pointer, and - IS_ERR() and PTR_ERR() - to get it back out again: avoids a separate pointer parameter for - the error number. Icky, but in a good way. - - - - - Breaking Compilation - - - Linus and the other developers sometimes change function or - structure names in development kernels; this is not done just to - keep everyone on their toes: it reflects a fundamental change - (eg. can no longer be called with interrupts on, or does extra - checks, or doesn't do checks which were caught before). Usually - this is accompanied by a fairly complete note to the linux-kernel - mailing list; search the archive. Simply doing a global replace - on the file usually makes things worse. - - - - - Initializing structure members - - - The preferred method of initializing structures is to use - designated initialisers, as defined by ISO C99, eg: - - -static struct block_device_operations opt_fops = { - .open = opt_open, - .release = opt_release, - .ioctl = opt_ioctl, - .check_media_change = opt_media_change, -}; - - - This makes it easy to grep for, and makes it clear which - structure fields are set. You should do this because it looks - cool. - - - - - GNU Extensions - - - GNU Extensions are explicitly allowed in the Linux kernel. - Note that some of the more complex ones are not very well - supported, due to lack of general use, but the following are - considered standard (see the GCC info page section "C - Extensions" for more details - Yes, really the info page, the - man page is only a short summary of the stuff in info). - - - - - Inline functions - - - - - Statement expressions (ie. the ({ and }) constructs). - - - - - Declaring attributes of a function / variable / type - (__attribute__) - - - - - typeof - - - - - Zero length arrays - - - - - Macro varargs - - - - - Arithmetic on void pointers - - - - - Non-Constant initializers - - - - - Assembler Instructions (not outside arch/ and include/asm/) - - - - - Function names as strings (__func__). - - - - - __builtin_constant_p() - - - - - - Be wary when using long long in the kernel, the code gcc generates for - it is horrible and worse: division and multiplication does not work - on i386 because the GCC runtime functions for it are missing from - the kernel environment. - - - - - - - C++ - - - Using C++ in the kernel is usually a bad idea, because the - kernel does not provide the necessary runtime environment - and the include files are not tested for it. It is still - possible, but not recommended. If you really want to do - this, forget about exceptions at least. - - - - - #if - - - It is generally considered cleaner to use macros in header files - (or at the top of .c files) to abstract away functions rather than - using `#if' pre-processor statements throughout the source code. - - - - - - Putting Your Stuff in the Kernel - - - In order to get your stuff into shape for official inclusion, or - even to make a neat patch, there's administrative work to be - done: - - - - - Figure out whose pond you've been pissing in. Look at the top of - the source files, inside the MAINTAINERS - file, and last of all in the CREDITS file. - You should coordinate with this person to make sure you're not - duplicating effort, or trying something that's already been - rejected. - - - - Make sure you put your name and EMail address at the top of - any files you create or mangle significantly. This is the - first place people will look when they find a bug, or when - they want to make a change. - - - - - - Usually you want a configuration option for your kernel hack. - Edit Kconfig in the appropriate directory. - The Config language is simple to use by cut and paste, and there's - complete documentation in - Documentation/kbuild/kconfig-language.txt. - - - - In your description of the option, make sure you address both the - expert user and the user who knows nothing about your feature. Mention - incompatibilities and issues here. Definitely - end your description with if in doubt, say N - (or, occasionally, `Y'); this is for people who have no - idea what you are talking about. - - - - - - Edit the Makefile: the CONFIG variables are - exported here so you can usually just add a "obj-$(CONFIG_xxx) += - xxx.o" line. The syntax is documented in - Documentation/kbuild/makefiles.txt. - - - - - - Put yourself in CREDITS if you've done - something noteworthy, usually beyond a single file (your name - should be at the top of the source files anyway). - MAINTAINERS means you want to be consulted - when changes are made to a subsystem, and hear about bugs; it - implies a more-than-passing commitment to some part of the code. - - - - - - Finally, don't forget to read Documentation/process/submitting-patches.rst - and possibly Documentation/process/submitting-drivers.rst. - - - - - - - Kernel Cantrips - - - Some favorites from browsing the source. Feel free to add to this - list. - - - - arch/x86/include/asm/delay.h: - - -#define ndelay(n) (__builtin_constant_p(n) ? \ - ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ - __ndelay(n)) - - - - include/linux/fs.h: - - -/* - * Kernel pointers have redundant information, so we can use a - * scheme where we can return either an error code or a dentry - * pointer with the same return value. - * - * This should be a per-architecture thing, to allow different - * error and pointer decisions. - */ - #define ERR_PTR(err) ((void *)((long)(err))) - #define PTR_ERR(ptr) ((long)(ptr)) - #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000)) - - - - arch/x86/include/asm/uaccess_32.h: - - - -#define copy_to_user(to,from,n) \ - (__builtin_constant_p(n) ? \ - __constant_copy_to_user((to),(from),(n)) : \ - __generic_copy_to_user((to),(from),(n))) - - - - arch/sparc/kernel/head.S: - - - -/* - * Sun people can't spell worth damn. "compatability" indeed. - * At least we *know* we can't spell, and use a spell-checker. - */ - -/* Uh, actually Linus it is I who cannot spell. Too much murky - * Sparc assembly will do this to ya. - */ -C_LABEL(cputypvar): - .asciz "compatibility" - -/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */ - .align 4 -C_LABEL(cputypvar_sun4m): - .asciz "compatible" - - - - arch/sparc/lib/checksum.S: - - - - /* Sun, you just can't beat me, you just can't. Stop trying, - * give up. I'm serious, I am going to kick the living shit - * out of you, game over, lights out. - */ - - - - - Thanks - - - Thanks to Andi Kleen for the idea, answering my questions, fixing - my mistakes, filling content, etc. Philipp Rumpf for more spelling - and clarity fixes, and some excellent non-obvious points. Werner - Almesberger for giving me a great summary of - disable_irq(), and Jes Sorensen and Andrea - Arcangeli added caveats. Michael Elizabeth Chastain for checking - and adding to the Configure section. Telsa Gwynne for teaching me DocBook. - - -
- diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl deleted file mode 100644 index 7c9cc4846cb6..000000000000 --- a/Documentation/DocBook/kernel-locking.tmpl +++ /dev/null @@ -1,2151 +0,0 @@ - - - - - - Unreliable Guide To Locking - - - - Rusty - Russell - -
- rusty@rustcorp.com.au -
-
-
-
- - - 2003 - Rusty Russell - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - Introduction - - Welcome, to Rusty's Remarkably Unreliable Guide to Kernel - Locking issues. This document describes the locking systems in - the Linux Kernel in 2.6. - - - With the wide availability of HyperThreading, and preemption in the Linux - Kernel, everyone hacking on the kernel needs to know the - fundamentals of concurrency and locking for - SMP. - - - - - The Problem With Concurrency - - (Skip this if you know what a Race Condition is). - - - In a normal program, you can increment a counter like so: - - - very_important_count++; - - - - This is what they would expect to happen: - - - - Expected Results - - - - - - Instance 1 - Instance 2 - - - - - - read very_important_count (5) - - - - add 1 (6) - - - - write very_important_count (6) - - - - - read very_important_count (6) - - - - add 1 (7) - - - - write very_important_count (7) - - - - -
- - - This is what might happen: - - - - Possible Results - - - - - Instance 1 - Instance 2 - - - - - - read very_important_count (5) - - - - - read very_important_count (5) - - - add 1 (6) - - - - - add 1 (6) - - - write very_important_count (6) - - - - - write very_important_count (6) - - - -
- - - Race Conditions and Critical Regions - - This overlap, where the result depends on the - relative timing of multiple tasks, is called a race condition. - The piece of code containing the concurrency issue is called a - critical region. And especially since Linux starting running - on SMP machines, they became one of the major issues in kernel - design and implementation. - - - Preemption can have the same effect, even if there is only one - CPU: by preempting one task during the critical region, we have - exactly the same race condition. In this case the thread which - preempts might run the critical region itself. - - - The solution is to recognize when these simultaneous accesses - occur, and use locks to make sure that only one instance can - enter the critical region at any time. There are many - friendly primitives in the Linux kernel to help you do this. - And then there are the unfriendly primitives, but I'll pretend - they don't exist. - - -
- - - Locking in the Linux Kernel - - - If I could give you one piece of advice: never sleep with anyone - crazier than yourself. But if I had to give you advice on - locking: keep it simple. - - - - Be reluctant to introduce new locks. - - - - Strangely enough, this last one is the exact reverse of my advice when - you have slept with someone crazier than yourself. - And you should think about getting a big dog. - - - - Two Main Types of Kernel Locks: Spinlocks and Mutexes - - - There are two main types of kernel locks. The fundamental type - is the spinlock - (include/asm/spinlock.h), - which is a very simple single-holder lock: if you can't get the - spinlock, you keep trying (spinning) until you can. Spinlocks are - very small and fast, and can be used anywhere. - - - The second type is a mutex - (include/linux/mutex.h): it - is like a spinlock, but you may block holding a mutex. - If you can't lock a mutex, your task will suspend itself, and be woken - up when the mutex is released. This means the CPU can do something - else while you are waiting. There are many cases when you simply - can't sleep (see ), and so have to - use a spinlock instead. - - - Neither type of lock is recursive: see - . - - - - - Locks and Uniprocessor Kernels - - - For kernels compiled without CONFIG_SMP, and - without CONFIG_PREEMPT spinlocks do not exist at - all. This is an excellent design decision: when no-one else can - run at the same time, there is no reason to have a lock. - - - - If the kernel is compiled without CONFIG_SMP, - but CONFIG_PREEMPT is set, then spinlocks - simply disable preemption, which is sufficient to prevent any - races. For most purposes, we can think of preemption as - equivalent to SMP, and not worry about it separately. - - - - You should always test your locking code with CONFIG_SMP - and CONFIG_PREEMPT enabled, even if you don't have an SMP test box, because it - will still catch some kinds of locking bugs. - - - - Mutexes still exist, because they are required for - synchronization between user - contexts, as we will see below. - - - - - Locking Only In User Context - - - If you have a data structure which is only ever accessed from - user context, then you can use a simple mutex - (include/linux/mutex.h) to protect it. This - is the most trivial case: you initialize the mutex. Then you can - call mutex_lock_interruptible() to grab the mutex, - and mutex_unlock() to release it. There is also a - mutex_lock(), which should be avoided, because it - will not return if a signal is received. - - - - Example: net/netfilter/nf_sockopt.c allows - registration of new setsockopt() and - getsockopt() calls, with - nf_register_sockopt(). Registration and - de-registration are only done on module load and unload (and boot - time, where there is no concurrency), and the list of registrations - is only consulted for an unknown setsockopt() - or getsockopt() system call. The - nf_sockopt_mutex is perfect to protect this, - especially since the setsockopt and getsockopt calls may well - sleep. - - - - - Locking Between User Context and Softirqs - - - If a softirq shares - data with user context, you have two problems. Firstly, the current - user context can be interrupted by a softirq, and secondly, the - critical region could be entered from another CPU. This is where - spin_lock_bh() - (include/linux/spinlock.h) is - used. It disables softirqs on that CPU, then grabs the lock. - spin_unlock_bh() does the reverse. (The - '_bh' suffix is a historical reference to "Bottom Halves", the - old name for software interrupts. It should really be - called spin_lock_softirq()' in a perfect world). - - - - Note that you can also use spin_lock_irq() - or spin_lock_irqsave() here, which stop - hardware interrupts as well: see . - - - - This works perfectly for UP - as well: the spin lock vanishes, and this macro - simply becomes local_bh_disable() - (include/linux/interrupt.h), which - protects you from the softirq being run. - - - - - Locking Between User Context and Tasklets - - - This is exactly the same as above, because tasklets are actually run - from a softirq. - - - - - Locking Between User Context and Timers - - - This, too, is exactly the same as above, because timers are actually run from - a softirq. From a locking point of view, tasklets and timers - are identical. - - - - - Locking Between Tasklets/Timers - - - Sometimes a tasklet or timer might want to share data with - another tasklet or timer. - - - - The Same Tasklet/Timer - - Since a tasklet is never run on two CPUs at once, you don't - need to worry about your tasklet being reentrant (running - twice at once), even on SMP. - - - - - Different Tasklets/Timers - - If another tasklet/timer wants - to share data with your tasklet or timer , you will both need to use - spin_lock() and - spin_unlock() calls. - spin_lock_bh() is - unnecessary here, as you are already in a tasklet, and - none will be run on the same CPU. - - - - - - Locking Between Softirqs - - - Often a softirq might - want to share data with itself or a tasklet/timer. - - - - The Same Softirq - - - The same softirq can run on the other CPUs: you can use a - per-CPU array (see ) for better - performance. If you're going so far as to use a softirq, - you probably care about scalable performance enough - to justify the extra complexity. - - - - You'll need to use spin_lock() and - spin_unlock() for shared data. - - - - - Different Softirqs - - - You'll need to use spin_lock() and - spin_unlock() for shared data, whether it - be a timer, tasklet, different softirq or the same or another - softirq: any of them could be running on a different CPU. - - - - - - - Hard IRQ Context - - - Hardware interrupts usually communicate with a - tasklet or softirq. Frequently this involves putting work in a - queue, which the softirq will take out. - - - - Locking Between Hard IRQ and Softirqs/Tasklets - - - If a hardware irq handler shares data with a softirq, you have - two concerns. Firstly, the softirq processing can be - interrupted by a hardware interrupt, and secondly, the - critical region could be entered by a hardware interrupt on - another CPU. This is where spin_lock_irq() is - used. It is defined to disable interrupts on that cpu, then grab - the lock. spin_unlock_irq() does the reverse. - - - - The irq handler does not to use - spin_lock_irq(), because the softirq cannot - run while the irq handler is running: it can use - spin_lock(), which is slightly faster. The - only exception would be if a different hardware irq handler uses - the same lock: spin_lock_irq() will stop - that from interrupting us. - - - - This works perfectly for UP as well: the spin lock vanishes, - and this macro simply becomes local_irq_disable() - (include/asm/smp.h), which - protects you from the softirq/tasklet/BH being run. - - - - spin_lock_irqsave() - (include/linux/spinlock.h) is a variant - which saves whether interrupts were on or off in a flags word, - which is passed to spin_unlock_irqrestore(). This - means that the same code can be used inside an hard irq handler (where - interrupts are already off) and in softirqs (where the irq - disabling is required). - - - - Note that softirqs (and hence tasklets and timers) are run on - return from hardware interrupts, so - spin_lock_irq() also stops these. In that - sense, spin_lock_irqsave() is the most - general and powerful locking function. - - - - - Locking Between Two Hard IRQ Handlers - - It is rare to have to share data between two IRQ handlers, but - if you do, spin_lock_irqsave() should be - used: it is architecture-specific whether all interrupts are - disabled inside irq handlers themselves. - - - - - - - Cheat Sheet For Locking - - Pete Zaitcev gives the following summary: - - - - - If you are in a process context (any syscall) and want to - lock other process out, use a mutex. You can take a mutex - and sleep (copy_from_user*( or - kmalloc(x,GFP_KERNEL)). - - - - - Otherwise (== data can be touched in an interrupt), use - spin_lock_irqsave() and - spin_unlock_irqrestore(). - - - - - Avoid holding spinlock for more than 5 lines of code and - across any function call (except accessors like - readb). - - - - - - Table of Minimum Requirements - - The following table lists the minimum - locking requirements between various contexts. In some cases, - the same context can only be running on one CPU at a time, so - no locking is required for that context (eg. a particular - thread can only run on one CPU at a time, but if it needs - shares data with another thread, locking is required). - - - Remember the advice above: you can always use - spin_lock_irqsave(), which is a superset - of all other spinlock primitives. - - - -Table of Locking Requirements - - - - - -IRQ Handler A -IRQ Handler B -Softirq A -Softirq B -Tasklet A -Tasklet B -Timer A -Timer B -User Context A -User Context B - - - -IRQ Handler A -None - - - -IRQ Handler B -SLIS -None - - - -Softirq A -SLI -SLI -SL - - - -Softirq B -SLI -SLI -SL -SL - - - -Tasklet A -SLI -SLI -SL -SL -None - - - -Tasklet B -SLI -SLI -SL -SL -SL -None - - - -Timer A -SLI -SLI -SL -SL -SL -SL -None - - - -Timer B -SLI -SLI -SL -SL -SL -SL -SL -None - - - -User Context A -SLI -SLI -SLBH -SLBH -SLBH -SLBH -SLBH -SLBH -None - - - -User Context B -SLI -SLI -SLBH -SLBH -SLBH -SLBH -SLBH -SLBH -MLI -None - - - - -
- - -Legend for Locking Requirements Table - - - - -SLIS -spin_lock_irqsave - - -SLI -spin_lock_irq - - -SL -spin_lock - - -SLBH -spin_lock_bh - - -MLI -mutex_lock_interruptible - - - - -
- -
-
- - - The trylock Functions - - There are functions that try to acquire a lock only once and immediately - return a value telling about success or failure to acquire the lock. - They can be used if you need no access to the data protected with the lock - when some other thread is holding the lock. You should acquire the lock - later if you then need access to the data protected with the lock. - - - - spin_trylock() does not spin but returns non-zero if - it acquires the spinlock on the first try or 0 if not. This function can - be used in all contexts like spin_lock: you must have - disabled the contexts that might interrupt you and acquire the spin lock. - - - - mutex_trylock() does not suspend your task - but returns non-zero if it could lock the mutex on the first try - or 0 if not. This function cannot be safely used in hardware or software - interrupt contexts despite not sleeping. - - - - - Common Examples - -Let's step through a simple example: a cache of number to name -mappings. The cache keeps a count of how often each of the objects is -used, and when it gets full, throws out the least used one. - - - - - All In User Context - -For our first example, we assume that all operations are in user -context (ie. from system calls), so we can sleep. This means we can -use a mutex to protect the cache and all the objects within -it. Here's the code: - - - -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/mutex.h> -#include <asm/errno.h> - -struct object -{ - struct list_head list; - int id; - char name[32]; - int popularity; -}; - -/* Protects the cache, cache_num, and the objects within it */ -static DEFINE_MUTEX(cache_lock); -static LIST_HEAD(cache); -static unsigned int cache_num = 0; -#define MAX_CACHE_SIZE 10 - -/* Must be holding cache_lock */ -static struct object *__cache_find(int id) -{ - struct object *i; - - list_for_each_entry(i, &cache, list) - if (i->id == id) { - i->popularity++; - return i; - } - return NULL; -} - -/* Must be holding cache_lock */ -static void __cache_delete(struct object *obj) -{ - BUG_ON(!obj); - list_del(&obj->list); - kfree(obj); - cache_num--; -} - -/* Must be holding cache_lock */ -static void __cache_add(struct object *obj) -{ - list_add(&obj->list, &cache); - if (++cache_num > MAX_CACHE_SIZE) { - struct object *i, *outcast = NULL; - list_for_each_entry(i, &cache, list) { - if (!outcast || i->popularity < outcast->popularity) - outcast = i; - } - __cache_delete(outcast); - } -} - -int cache_add(int id, const char *name) -{ - struct object *obj; - - if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) - return -ENOMEM; - - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; - - mutex_lock(&cache_lock); - __cache_add(obj); - mutex_unlock(&cache_lock); - return 0; -} - -void cache_delete(int id) -{ - mutex_lock(&cache_lock); - __cache_delete(__cache_find(id)); - mutex_unlock(&cache_lock); -} - -int cache_find(int id, char *name) -{ - struct object *obj; - int ret = -ENOENT; - - mutex_lock(&cache_lock); - obj = __cache_find(id); - if (obj) { - ret = 0; - strcpy(name, obj->name); - } - mutex_unlock(&cache_lock); - return ret; -} - - - -Note that we always make sure we have the cache_lock when we add, -delete, or look up the cache: both the cache infrastructure itself and -the contents of the objects are protected by the lock. In this case -it's easy, since we copy the data for the user, and never let them -access the objects directly. - - -There is a slight (and common) optimization here: in -cache_add we set up the fields of the object -before grabbing the lock. This is safe, as no-one else can access it -until we put it in cache. - - - - - Accessing From Interrupt Context - -Now consider the case where cache_find can be -called from interrupt context: either a hardware interrupt or a -softirq. An example would be a timer which deletes object from the -cache. - - -The change is shown below, in standard patch format: the -- are lines which are taken away, and the -+ are lines which are added. - - ---- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100 -+++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100 -@@ -12,7 +12,7 @@ - int popularity; - }; - --static DEFINE_MUTEX(cache_lock); -+static DEFINE_SPINLOCK(cache_lock); - static LIST_HEAD(cache); - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 -@@ -55,6 +55,7 @@ - int cache_add(int id, const char *name) - { - struct object *obj; -+ unsigned long flags; - - if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) - return -ENOMEM; -@@ -63,30 +64,33 @@ - obj->id = id; - obj->popularity = 0; - -- mutex_lock(&cache_lock); -+ spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - return 0; - } - - void cache_delete(int id) - { -- mutex_lock(&cache_lock); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); - __cache_delete(__cache_find(id)); -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - } - - int cache_find(int id, char *name) - { - struct object *obj; - int ret = -ENOENT; -+ unsigned long flags; - -- mutex_lock(&cache_lock); -+ spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); - if (obj) { - ret = 0; - strcpy(name, obj->name); - } -- mutex_unlock(&cache_lock); -+ spin_unlock_irqrestore(&cache_lock, flags); - return ret; - } - - - -Note that the spin_lock_irqsave will turn off -interrupts if they are on, otherwise does nothing (if we are already -in an interrupt handler), hence these functions are safe to call from -any context. - - -Unfortunately, cache_add calls -kmalloc with the GFP_KERNEL -flag, which is only legal in user context. I have assumed that -cache_add is still only called in user context, -otherwise this should become a parameter to -cache_add. - - - - Exposing Objects Outside This File - -If our objects contained more information, it might not be sufficient -to copy the information in and out: other parts of the code might want -to keep pointers to these objects, for example, rather than looking up -the id every time. This produces two problems. - - -The first problem is that we use the cache_lock to -protect objects: we'd need to make this non-static so the rest of the -code can use it. This makes locking trickier, as it is no longer all -in one place. - - -The second problem is the lifetime problem: if another structure keeps -a pointer to an object, it presumably expects that pointer to remain -valid. Unfortunately, this is only guaranteed while you hold the -lock, otherwise someone might call cache_delete -and even worse, add another object, re-using the same address. - - -As there is only one lock, you can't hold it forever: no-one else would -get any work done. - - -The solution to this problem is to use a reference count: everyone who -has a pointer to the object increases it when they first get the -object, and drops the reference count when they're finished with it. -Whoever drops it to zero knows it is unused, and can actually delete it. - - -Here is the code: - - - ---- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100 -+++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100 -@@ -7,6 +7,7 @@ - struct object - { - struct list_head list; -+ unsigned int refcnt; - int id; - char name[32]; - int popularity; -@@ -17,6 +18,35 @@ - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 - -+static void __object_put(struct object *obj) -+{ -+ if (--obj->refcnt == 0) -+ kfree(obj); -+} -+ -+static void __object_get(struct object *obj) -+{ -+ obj->refcnt++; -+} -+ -+void object_put(struct object *obj) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); -+ __object_put(obj); -+ spin_unlock_irqrestore(&cache_lock, flags); -+} -+ -+void object_get(struct object *obj) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&cache_lock, flags); -+ __object_get(obj); -+ spin_unlock_irqrestore(&cache_lock, flags); -+} -+ - /* Must be holding cache_lock */ - static struct object *__cache_find(int id) - { -@@ -35,6 +65,7 @@ - { - BUG_ON(!obj); - list_del(&obj->list); -+ __object_put(obj); - cache_num--; - } - -@@ -63,6 +94,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; -+ obj->refcnt = 1; /* The cache holds a reference */ - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -@@ -79,18 +111,15 @@ - spin_unlock_irqrestore(&cache_lock, flags); - } - --int cache_find(int id, char *name) -+struct object *cache_find(int id) - { - struct object *obj; -- int ret = -ENOENT; - unsigned long flags; - - spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); -- if (obj) { -- ret = 0; -- strcpy(name, obj->name); -- } -+ if (obj) -+ __object_get(obj); - spin_unlock_irqrestore(&cache_lock, flags); -- return ret; -+ return obj; - } - - - -We encapsulate the reference counting in the standard 'get' and 'put' -functions. Now we can return the object itself from -cache_find which has the advantage that the user -can now sleep holding the object (eg. to -copy_to_user to name to userspace). - - -The other point to note is that I said a reference should be held for -every pointer to the object: thus the reference count is 1 when first -inserted into the cache. In some versions the framework does not hold -a reference count, but they are more complicated. - - - - Using Atomic Operations For The Reference Count - -In practice, atomic_t would usually be used for -refcnt. There are a number of atomic -operations defined in - -include/asm/atomic.h: these are -guaranteed to be seen atomically from all CPUs in the system, so no -lock is required. In this case, it is simpler than using spinlocks, -although for anything non-trivial using spinlocks is clearer. The -atomic_inc and -atomic_dec_and_test are used instead of the -standard increment and decrement operators, and the lock is no longer -used to protect the reference count itself. - - - ---- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100 -+++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100 -@@ -7,7 +7,7 @@ - struct object - { - struct list_head list; -- unsigned int refcnt; -+ atomic_t refcnt; - int id; - char name[32]; - int popularity; -@@ -18,33 +18,15 @@ - static unsigned int cache_num = 0; - #define MAX_CACHE_SIZE 10 - --static void __object_put(struct object *obj) --{ -- if (--obj->refcnt == 0) -- kfree(obj); --} -- --static void __object_get(struct object *obj) --{ -- obj->refcnt++; --} -- - void object_put(struct object *obj) - { -- unsigned long flags; -- -- spin_lock_irqsave(&cache_lock, flags); -- __object_put(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ if (atomic_dec_and_test(&obj->refcnt)) -+ kfree(obj); - } - - void object_get(struct object *obj) - { -- unsigned long flags; -- -- spin_lock_irqsave(&cache_lock, flags); -- __object_get(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ atomic_inc(&obj->refcnt); - } - - /* Must be holding cache_lock */ -@@ -65,7 +47,7 @@ - { - BUG_ON(!obj); - list_del(&obj->list); -- __object_put(obj); -+ object_put(obj); - cache_num--; - } - -@@ -94,7 +76,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); - obj->id = id; - obj->popularity = 0; -- obj->refcnt = 1; /* The cache holds a reference */ -+ atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); -@@ -119,7 +101,7 @@ - spin_lock_irqsave(&cache_lock, flags); - obj = __cache_find(id); - if (obj) -- __object_get(obj); -+ object_get(obj); - spin_unlock_irqrestore(&cache_lock, flags); - return obj; - } - - - - - - Protecting The Objects Themselves - -In these examples, we assumed that the objects (except the reference -counts) never changed once they are created. If we wanted to allow -the name to change, there are three possibilities: - - - - -You can make cache_lock non-static, and tell people -to grab that lock before changing the name in any object. - - - - -You can provide a cache_obj_rename which grabs -this lock and changes the name for the caller, and tell everyone to -use that function. - - - - -You can make the cache_lock protect only the cache -itself, and use another lock to protect the name. - - - - - -Theoretically, you can make the locks as fine-grained as one lock for -every field, for every object. In practice, the most common variants -are: - - - - -One lock which protects the infrastructure (the cache -list in this example) and all the objects. This is what we have done -so far. - - - - -One lock which protects the infrastructure (including the list -pointers inside the objects), and one lock inside the object which -protects the rest of that object. - - - - -Multiple locks to protect the infrastructure (eg. one lock per hash -chain), possibly with a separate per-object lock. - - - - - -Here is the "lock-per-object" implementation: - - ---- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100 -+++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 -@@ -6,11 +6,17 @@ - - struct object - { -+ /* These two protected by cache_lock. */ - struct list_head list; -+ int popularity; -+ - atomic_t refcnt; -+ -+ /* Doesn't change once created. */ - int id; -+ -+ spinlock_t lock; /* Protects the name */ - char name[32]; -- int popularity; - }; - - static DEFINE_SPINLOCK(cache_lock); -@@ -77,6 +84,7 @@ - obj->id = id; - obj->popularity = 0; - atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ -+ spin_lock_init(&obj->lock); - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); - - - -Note that I decide that the popularity -count should be protected by the cache_lock rather -than the per-object lock: this is because it (like the -struct list_head inside the object) is -logically part of the infrastructure. This way, I don't need to grab -the lock of every object in __cache_add when -seeking the least popular. - - - -I also decided that the id member is -unchangeable, so I don't need to grab each object lock in -__cache_find() to examine the -id: the object lock is only used by a -caller who wants to read or write the name -field. - - - -Note also that I added a comment describing what data was protected by -which locks. This is extremely important, as it describes the runtime -behavior of the code, and can be hard to gain from just reading. And -as Alan Cox says, Lock data, not code. - - - - - - Common Problems - - Deadlock: Simple and Advanced - - - There is a coding bug where a piece of code tries to grab a - spinlock twice: it will spin forever, waiting for the lock to - be released (spinlocks, rwlocks and mutexes are not - recursive in Linux). This is trivial to diagnose: not a - stay-up-five-nights-talk-to-fluffy-code-bunnies kind of - problem. - - - - For a slightly more complex case, imagine you have a region - shared by a softirq and user context. If you use a - spin_lock() call to protect it, it is - possible that the user context will be interrupted by the softirq - while it holds the lock, and the softirq will then spin - forever trying to get the same lock. - - - - Both of these are called deadlock, and as shown above, it can - occur even with a single CPU (although not on UP compiles, - since spinlocks vanish on kernel compiles with - CONFIG_SMP=n. You'll still get data corruption - in the second example). - - - - This complete lockup is easy to diagnose: on SMP boxes the - watchdog timer or compiling with DEBUG_SPINLOCK set - (include/linux/spinlock.h) will show this up - immediately when it happens. - - - - A more complex problem is the so-called 'deadly embrace', - involving two or more locks. Say you have a hash table: each - entry in the table is a spinlock, and a chain of hashed - objects. Inside a softirq handler, you sometimes want to - alter an object from one place in the hash to another: you - grab the spinlock of the old hash chain and the spinlock of - the new hash chain, and delete the object from the old one, - and insert it in the new one. - - - - There are two problems here. First, if your code ever - tries to move the object to the same chain, it will deadlock - with itself as it tries to lock it twice. Secondly, if the - same softirq on another CPU is trying to move another object - in the reverse direction, the following could happen: - - - - Consequences - - - - - - CPU 1 - CPU 2 - - - - - - Grab lock A -> OK - Grab lock B -> OK - - - Grab lock B -> spin - Grab lock A -> spin - - - -
- - - The two CPUs will spin forever, waiting for the other to give up - their lock. It will look, smell, and feel like a crash. - -
- - - Preventing Deadlock - - - Textbooks will tell you that if you always lock in the same - order, you will never get this kind of deadlock. Practice - will tell you that this approach doesn't scale: when I - create a new lock, I don't understand enough of the kernel - to figure out where in the 5000 lock hierarchy it will fit. - - - - The best locks are encapsulated: they never get exposed in - headers, and are never held around calls to non-trivial - functions outside the same file. You can read through this - code and see that it will never deadlock, because it never - tries to grab another lock while it has that one. People - using your code don't even need to know you are using a - lock. - - - - A classic problem here is when you provide callbacks or - hooks: if you call these with the lock held, you risk simple - deadlock, or a deadly embrace (who knows what the callback - will do?). Remember, the other programmers are out to get - you, so don't do this. - - - - Overzealous Prevention Of Deadlocks - - - Deadlocks are problematic, but not as bad as data - corruption. Code which grabs a read lock, searches a list, - fails to find what it wants, drops the read lock, grabs a - write lock and inserts the object has a race condition. - - - - If you don't see why, please stay the fuck away from my code. - - - - - - Racing Timers: A Kernel Pastime - - - Timers can produce their own special problems with races. - Consider a collection of objects (list, hash, etc) where each - object has a timer which is due to destroy it. - - - - If you want to destroy the entire collection (say on module - removal), you might do the following: - - - - /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE - HUNGARIAN NOTATION */ - spin_lock_bh(&list_lock); - - while (list) { - struct foo *next = list->next; - del_timer(&list->timer); - kfree(list); - list = next; - } - - spin_unlock_bh(&list_lock); - - - - Sooner or later, this will crash on SMP, because a timer can - have just gone off before the spin_lock_bh(), - and it will only get the lock after we - spin_unlock_bh(), and then try to free - the element (which has already been freed!). - - - - This can be avoided by checking the result of - del_timer(): if it returns - 1, the timer has been deleted. - If 0, it means (in this - case) that it is currently running, so we can do: - - - - retry: - spin_lock_bh(&list_lock); - - while (list) { - struct foo *next = list->next; - if (!del_timer(&list->timer)) { - /* Give timer a chance to delete this */ - spin_unlock_bh(&list_lock); - goto retry; - } - kfree(list); - list = next; - } - - spin_unlock_bh(&list_lock); - - - - Another common problem is deleting timers which restart - themselves (by calling add_timer() at the end - of their timer function). Because this is a fairly common case - which is prone to races, you should use del_timer_sync() - (include/linux/timer.h) - to handle this case. It returns the number of times the timer - had to be deleted before we finally stopped it from adding itself back - in. - - - -
- - - Locking Speed - - -There are three main things to worry about when considering speed of -some code which does locking. First is concurrency: how many things -are going to be waiting while someone else is holding a lock. Second -is the time taken to actually acquire and release an uncontended lock. -Third is using fewer, or smarter locks. I'm assuming that the lock is -used fairly often: otherwise, you wouldn't be concerned about -efficiency. - - -Concurrency depends on how long the lock is usually held: you should -hold the lock for as long as needed, but no longer. In the cache -example, we always create the object without the lock held, and then -grab the lock only when we are ready to insert it in the list. - - -Acquisition times depend on how much damage the lock operations do to -the pipeline (pipeline stalls) and how likely it is that this CPU was -the last one to grab the lock (ie. is the lock cache-hot for this -CPU): on a machine with more CPUs, this likelihood drops fast. -Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns, -an atomic increment takes about 58ns, a lock which is cache-hot on -this CPU takes 160ns, and a cacheline transfer from another CPU takes -an additional 170 to 360ns. (These figures from Paul McKenney's - Linux -Journal RCU article). - - -These two aims conflict: holding a lock for a short time might be done -by splitting locks into parts (such as in our final per-object-lock -example), but this increases the number of lock acquisitions, and the -results are often slower than having a single lock. This is another -reason to advocate locking simplicity. - - -The third concern is addressed below: there are some methods to reduce -the amount of locking which needs to be done. - - - - Read/Write Lock Variants - - - Both spinlocks and mutexes have read/write variants: - rwlock_t and struct rw_semaphore. - These divide users into two classes: the readers and the writers. If - you are only reading the data, you can get a read lock, but to write to - the data you need the write lock. Many people can hold a read lock, - but a writer must be sole holder. - - - - If your code divides neatly along reader/writer lines (as our - cache code does), and the lock is held by readers for - significant lengths of time, using these locks can help. They - are slightly slower than the normal locks though, so in practice - rwlock_t is not usually worthwhile. - - - - - Avoiding Locks: Read Copy Update - - - There is a special method of read/write locking called Read Copy - Update. Using RCU, the readers can avoid taking a lock - altogether: as we expect our cache to be read more often than - updated (otherwise the cache is a waste of time), it is a - candidate for this optimization. - - - - How do we get rid of read locks? Getting rid of read locks - means that writers may be changing the list underneath the - readers. That is actually quite simple: we can read a linked - list while an element is being added if the writer adds the - element very carefully. For example, adding - new to a single linked list called - list: - - - - new->next = list->next; - wmb(); - list->next = new; - - - - The wmb() is a write memory barrier. It - ensures that the first operation (setting the new element's - next pointer) is complete and will be seen by - all CPUs, before the second operation is (putting the new - element into the list). This is important, since modern - compilers and modern CPUs can both reorder instructions unless - told otherwise: we want a reader to either not see the new - element at all, or see the new element with the - next pointer correctly pointing at the rest of - the list. - - - Fortunately, there is a function to do this for standard - struct list_head lists: - list_add_rcu() - (include/linux/list.h). - - - Removing an element from the list is even simpler: we replace - the pointer to the old element with a pointer to its successor, - and readers will either see it, or skip over it. - - - list->next = old->next; - - - There is list_del_rcu() - (include/linux/list.h) which does this (the - normal version poisons the old object, which we don't want). - - - The reader must also be careful: some CPUs can look through the - next pointer to start reading the contents of - the next element early, but don't realize that the pre-fetched - contents is wrong when the next pointer changes - underneath them. Once again, there is a - list_for_each_entry_rcu() - (include/linux/list.h) to help you. Of - course, writers can just use - list_for_each_entry(), since there cannot - be two simultaneous writers. - - - Our final dilemma is this: when can we actually destroy the - removed element? Remember, a reader might be stepping through - this element in the list right now: if we free this element and - the next pointer changes, the reader will jump - off into garbage and crash. We need to wait until we know that - all the readers who were traversing the list when we deleted the - element are finished. We use call_rcu() to - register a callback which will actually destroy the object once - all pre-existing readers are finished. Alternatively, - synchronize_rcu() may be used to block until - all pre-existing are finished. - - - But how does Read Copy Update know when the readers are - finished? The method is this: firstly, the readers always - traverse the list inside - rcu_read_lock()/rcu_read_unlock() - pairs: these simply disable preemption so the reader won't go to - sleep while reading the list. - - - RCU then waits until every other CPU has slept at least once: - since readers cannot sleep, we know that any readers which were - traversing the list during the deletion are finished, and the - callback is triggered. The real Read Copy Update code is a - little more optimized than this, but this is the fundamental - idea. - - - ---- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 -+++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100 -@@ -1,15 +1,18 @@ - #include <linux/list.h> - #include <linux/slab.h> - #include <linux/string.h> -+#include <linux/rcupdate.h> - #include <linux/mutex.h> - #include <asm/errno.h> - - struct object - { -- /* These two protected by cache_lock. */ -+ /* This is protected by RCU */ - struct list_head list; - int popularity; - -+ struct rcu_head rcu; -+ - atomic_t refcnt; - - /* Doesn't change once created. */ -@@ -40,7 +43,7 @@ - { - struct object *i; - -- list_for_each_entry(i, &cache, list) { -+ list_for_each_entry_rcu(i, &cache, list) { - if (i->id == id) { - i->popularity++; - return i; -@@ -49,19 +52,25 @@ - return NULL; - } - -+/* Final discard done once we know no readers are looking. */ -+static void cache_delete_rcu(void *arg) -+{ -+ object_put(arg); -+} -+ - /* Must be holding cache_lock */ - static void __cache_delete(struct object *obj) - { - BUG_ON(!obj); -- list_del(&obj->list); -- object_put(obj); -+ list_del_rcu(&obj->list); - cache_num--; -+ call_rcu(&obj->rcu, cache_delete_rcu); - } - - /* Must be holding cache_lock */ - static void __cache_add(struct object *obj) - { -- list_add(&obj->list, &cache); -+ list_add_rcu(&obj->list, &cache); - if (++cache_num > MAX_CACHE_SIZE) { - struct object *i, *outcast = NULL; - list_for_each_entry(i, &cache, list) { -@@ -104,12 +114,11 @@ - struct object *cache_find(int id) - { - struct object *obj; -- unsigned long flags; - -- spin_lock_irqsave(&cache_lock, flags); -+ rcu_read_lock(); - obj = __cache_find(id); - if (obj) - object_get(obj); -- spin_unlock_irqrestore(&cache_lock, flags); -+ rcu_read_unlock(); - return obj; - } - - - -Note that the reader will alter the -popularity member in -__cache_find(), and now it doesn't hold a lock. -One solution would be to make it an atomic_t, but for -this usage, we don't really care about races: an approximate result is -good enough, so I didn't change it. - - - -The result is that cache_find() requires no -synchronization with any other functions, so is almost as fast on SMP -as it would be on UP. - - - -There is a further optimization possible here: remember our original -cache code, where there were no reference counts and the caller simply -held the lock whenever using the object? This is still possible: if -you hold the lock, no one can delete the object, so you don't need to -get and put the reference count. - - - -Now, because the 'read lock' in RCU is simply disabling preemption, a -caller which always has preemption disabled between calling -cache_find() and -object_put() does not need to actually get and -put the reference count: we could expose -__cache_find() by making it non-static, and -such callers could simply call that. - - -The benefit here is that the reference count is not written to: the -object is not altered in any way, which is much faster on SMP -machines due to caching. - - - - - Per-CPU Data - - - Another technique for avoiding locking which is used fairly - widely is to duplicate information for each CPU. For example, - if you wanted to keep a count of a common condition, you could - use a spin lock and a single counter. Nice and simple. - - - - If that was too slow (it's usually not, but if you've got a - really big machine to test on and can show that it is), you - could instead use a counter for each CPU, then none of them need - an exclusive lock. See DEFINE_PER_CPU(), - get_cpu_var() and - put_cpu_var() - (include/linux/percpu.h). - - - - Of particular use for simple per-cpu counters is the - local_t type, and the - cpu_local_inc() and related functions, - which are more efficient than simple code on some architectures - (include/asm/local.h). - - - - Note that there is no simple, reliable way of getting an exact - value of such a counter, without introducing more locks. This - is not a problem for some uses. - - - - - Data Which Mostly Used By An IRQ Handler - - - If data is always accessed from within the same IRQ handler, you - don't need a lock at all: the kernel already guarantees that the - irq handler will not run simultaneously on multiple CPUs. - - - Manfred Spraul points out that you can still do this, even if - the data is very occasionally accessed in user context or - softirqs/tasklets. The irq handler doesn't use a lock, and - all other accesses are done as so: - - - - spin_lock(&lock); - disable_irq(irq); - ... - enable_irq(irq); - spin_unlock(&lock); - - - The disable_irq() prevents the irq handler - from running (and waits for it to finish if it's currently - running on other CPUs). The spinlock prevents any other - accesses happening at the same time. Naturally, this is slower - than just a spin_lock_irq() call, so it - only makes sense if this type of access happens extremely - rarely. - - - - - - What Functions Are Safe To Call From Interrupts? - - - Many functions in the kernel sleep (ie. call schedule()) - directly or indirectly: you can never call them while holding a - spinlock, or with preemption disabled. This also means you need - to be in user context: calling them from an interrupt is illegal. - - - - Some Functions Which Sleep - - - The most common ones are listed below, but you usually have to - read the code to find out if other calls are safe. If everyone - else who calls it can sleep, you probably need to be able to - sleep, too. In particular, registration and deregistration - functions usually expect to be called from user context, and can - sleep. - - - - - - Accesses to - userspace: - - - - - copy_from_user() - - - - - copy_to_user() - - - - - get_user() - - - - - put_user() - - - - - - - - kmalloc(GFP_KERNEL) - - - - - - mutex_lock_interruptible() and - mutex_lock() - - - There is a mutex_trylock() which does not - sleep. Still, it must not be used inside interrupt context since - its implementation is not safe for that. - mutex_unlock() will also never sleep. - It cannot be used in interrupt context either since a mutex - must be released by the same task that acquired it. - - - - - - - Some Functions Which Don't Sleep - - - Some functions are safe to call from any context, or holding - almost any lock. - - - - - - printk() - - - - - kfree() - - - - - add_timer() and del_timer() - - - - - - - - Mutex API reference -!Iinclude/linux/mutex.h -!Ekernel/locking/mutex.c - - - - Futex API reference -!Ikernel/futex.c - - - - Further reading - - - - - Documentation/locking/spinlocks.txt: - Linus Torvalds' spinlocking tutorial in the kernel sources. - - - - - - Unix Systems for Modern Architectures: Symmetric - Multiprocessing and Caching for Kernel Programmers: - - - - Curt Schimmel's very good introduction to kernel level - locking (not written for Linux, but nearly everything - applies). The book is expensive, but really worth every - penny to understand SMP locking. [ISBN: 0201633388] - - - - - - - Thanks - - - Thanks to Telsa Gwynne for DocBooking, neatening and adding - style. - - - - Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul - Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim - Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney, - John Ashby for proofreading, correcting, flaming, commenting. - - - - Thanks to the cabal for having no influence on this document. - - - - - Glossary - - - preemption - - - Prior to 2.5, or when CONFIG_PREEMPT is - unset, processes in user context inside the kernel would not - preempt each other (ie. you had that CPU until you gave it up, - except for interrupts). With the addition of - CONFIG_PREEMPT in 2.5.4, this changed: when - in user context, higher priority tasks can "cut in": spinlocks - were changed to disable preemption, even on UP. - - - - - - bh - - - Bottom Half: for historical reasons, functions with - '_bh' in them often now refer to any software interrupt, e.g. - spin_lock_bh() blocks any software interrupt - on the current CPU. Bottom halves are deprecated, and will - eventually be replaced by tasklets. Only one bottom half will be - running at any time. - - - - - - Hardware Interrupt / Hardware IRQ - - - Hardware interrupt request. in_irq() returns - true in a hardware interrupt handler. - - - - - - Interrupt Context - - - Not user context: processing a hardware irq or software irq. - Indicated by the in_interrupt() macro - returning true. - - - - - - SMP - - - Symmetric Multi-Processor: kernels compiled for multiple-CPU - machines. (CONFIG_SMP=y). - - - - - - Software Interrupt / softirq - - - Software interrupt handler. in_irq() returns - false; in_softirq() - returns true. Tasklets and softirqs - both fall into the category of 'software interrupts'. - - - Strictly speaking a softirq is one of up to 32 enumerated software - interrupts which can run on multiple CPUs at once. - Sometimes used to refer to tasklets as - well (ie. all software interrupts). - - - - - - tasklet - - - A dynamically-registrable software interrupt, - which is guaranteed to only run on one CPU at a time. - - - - - - timer - - - A dynamically-registrable software interrupt, which is run at - (or close to) a given time. When running, it is just like a - tasklet (in fact, they are called from the TIMER_SOFTIRQ). - - - - - - UP - - - Uni-Processor: Non-SMP. (CONFIG_SMP=n). - - - - - - User Context - - - The kernel executing on behalf of a particular process (ie. a - system call or trap) or kernel thread. You can tell which - process with the current macro.) Not to - be confused with userspace. Can be interrupted by software or - hardware interrupts. - - - - - - Userspace - - - A process executing its own code outside the kernel. - - - - - -
- diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl deleted file mode 100644 index 856ac20bf367..000000000000 --- a/Documentation/DocBook/kgdb.tmpl +++ /dev/null @@ -1,918 +0,0 @@ - - - - - - Using kgdb, kdb and the kernel debugger internals - - - - Jason - Wessel - -
- jason.wessel@windriver.com -
-
-
-
- - 2008,2010 - Wind River Systems, Inc. - - - 2004-2005 - MontaVista Software, Inc. - - - 2004 - Amit S. Kale - - - - - This file is licensed under the terms of the GNU General Public License - version 2. This program is licensed "as is" without any warranty of any - kind, whether express or implied. - - - -
- - - - Introduction - - The kernel has two different debugger front ends (kdb and kgdb) - which interface to the debug core. It is possible to use either - of the debugger front ends and dynamically transition between them - if you configure the kernel properly at compile and runtime. - - - Kdb is simplistic shell-style interface which you can use on a - system console with a keyboard or serial console. You can use it - to inspect memory, registers, process lists, dmesg, and even set - breakpoints to stop in a certain location. Kdb is not a source - level debugger, although you can set breakpoints and execute some - basic kernel run control. Kdb is mainly aimed at doing some - analysis to aid in development or diagnosing kernel problems. You - can access some symbols by name in kernel built-ins or in kernel - modules if the code was built - with CONFIG_KALLSYMS. - - - Kgdb is intended to be used as a source level debugger for the - Linux kernel. It is used along with gdb to debug a Linux kernel. - The expectation is that gdb can be used to "break in" to the - kernel to inspect memory, variables and look through call stack - information similar to the way an application developer would use - gdb to debug an application. It is possible to place breakpoints - in kernel code and perform some limited execution stepping. - - - Two machines are required for using kgdb. One of these machines is - a development machine and the other is the target machine. The - kernel to be debugged runs on the target machine. The development - machine runs an instance of gdb against the vmlinux file which - contains the symbols (not a boot image such as bzImage, zImage, - uImage...). In gdb the developer specifies the connection - parameters and connects to kgdb. The type of connection a - developer makes with gdb depends on the availability of kgdb I/O - modules compiled as built-ins or loadable kernel modules in the test - machine's kernel. - - - - Compiling a kernel - - - In order to enable compilation of kdb, you must first enable kgdb. - The kgdb test compile options are described in the kgdb test suite chapter. - - - - Kernel config options for kgdb - - To enable CONFIG_KGDB you should look under - "Kernel hacking" / "Kernel debugging" and select "KGDB: kernel debugger". - - - While it is not a hard requirement that you have symbols in your - vmlinux file, gdb tends not to be very useful without the symbolic - data, so you will want to turn - on CONFIG_DEBUG_INFO which is called "Compile the - kernel with debug info" in the config menu. - - - It is advised, but not required, that you turn on the - CONFIG_FRAME_POINTER kernel option which is called "Compile the - kernel with frame pointers" in the config menu. This option - inserts code to into the compiled executable which saves the frame - information in registers or on the stack at different points which - allows a debugger such as gdb to more accurately construct - stack back traces while debugging the kernel. - - - If the architecture that you are using supports the kernel option - CONFIG_STRICT_KERNEL_RWX, you should consider turning it off. This - option will prevent the use of software breakpoints because it - marks certain regions of the kernel's memory space as read-only. - If kgdb supports it for the architecture you are using, you can - use hardware breakpoints if you desire to run with the - CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off - this option. - - - Next you should choose one of more I/O drivers to interconnect - debugging host and debugged target. Early boot debugging requires - a KGDB I/O driver that supports early debugging and the driver - must be built into the kernel directly. Kgdb I/O driver - configuration takes place via kernel or module parameters which - you can learn more about in the in the section that describes the - parameter "kgdboc". - - Here is an example set of .config symbols to enable or - disable for kgdb: - - # CONFIG_STRICT_KERNEL_RWX is not set - CONFIG_FRAME_POINTER=y - CONFIG_KGDB=y - CONFIG_KGDB_SERIAL_CONSOLE=y - - - - - Kernel config options for kdb - Kdb is quite a bit more complex than the simple gdbstub - sitting on top of the kernel's debug core. Kdb must implement a - shell, and also adds some helper functions in other parts of the - kernel, responsible for printing out interesting data such as what - you would see if you ran "lsmod", or "ps". In order to build kdb - into the kernel you follow the same steps as you would for kgdb. - - The main config option for kdb - is CONFIG_KGDB_KDB which is called "KGDB_KDB: - include kdb frontend for kgdb" in the config menu. In theory you - would have already also selected an I/O driver such as the - CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a - serial port, when you were configuring kgdb. - - If you want to use a PS/2-style keyboard with kdb, you would - select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as - input device" in the config menu. The CONFIG_KDB_KEYBOARD option - is not used for anything in the gdb interface to kgdb. The - CONFIG_KDB_KEYBOARD option only works with kdb. - - Here is an example set of .config symbols to enable/disable kdb: - - # CONFIG_STRICT_KERNEL_RWX is not set - CONFIG_FRAME_POINTER=y - CONFIG_KGDB=y - CONFIG_KGDB_SERIAL_CONSOLE=y - CONFIG_KGDB_KDB=y - CONFIG_KDB_KEYBOARD=y - - - - - - Kernel Debugger Boot Arguments - This section describes the various runtime kernel - parameters that affect the configuration of the kernel debugger. - The following chapter covers using kdb and kgdb as well as - providing some examples of the configuration parameters. - - Kernel parameter: kgdboc - The kgdboc driver was originally an abbreviation meant to - stand for "kgdb over console". Today it is the primary mechanism - to configure how to communicate from gdb to kgdb as well as the - devices you want to use to interact with the kdb shell. - - For kgdb/gdb, kgdboc is designed to work with a single serial - port. It is intended to cover the circumstance where you want to - use a serial console as your primary console as well as using it to - perform kernel debugging. It is also possible to use kgdb on a - serial port which is not designated as a system console. Kgdboc - may be configured as a kernel built-in or a kernel loadable module. - You can only make use of kgdbwait and early - debugging if you build kgdboc into the kernel as a built-in. - - Optionally you can elect to activate kms (Kernel Mode - Setting) integration. When you use kms with kgdboc and you have a - video driver that has atomic mode setting hooks, it is possible to - enter the debugger on the graphics console. When the kernel - execution is resumed, the previous graphics mode will be restored. - This integration can serve as a useful tool to aid in diagnosing - crashes or doing analysis of memory with kdb while allowing the - full graphics console applications to run. - - - kgdboc arguments - Usage: kgdboc=[kms][[,]kbd][[,]serial_device][,baud] - The order listed above must be observed if you use any of the - optional configurations together. - - Abbreviations: - - kms = Kernel Mode Setting - kbd = Keyboard - - - You can configure kgdboc to use the keyboard, and/or a serial - device depending on if you are using kdb and/or kgdb, in one of the - following scenarios. The order listed above must be observed if - you use any of the optional configurations together. Using kms + - only gdb is generally not a useful combination. - - Using loadable module or built-in - - - As a kernel built-in: - Use the kernel boot argument: kgdboc=<tty-device>,[baud] - - As a kernel loadable module: - Use the command: modprobe kgdboc kgdboc=<tty-device>,[baud] - Here are two examples of how you might format the kgdboc - string. The first is for an x86 target using the first serial port. - The second example is for the ARM Versatile AB using the second - serial port. - - kgdboc=ttyS0,115200 - kgdboc=ttyAMA1,115200 - - - - - - - Configure kgdboc at runtime with sysfs - At run time you can enable or disable kgdboc by echoing a - parameters into the sysfs. Here are two examples: - - Enable kgdboc on ttyS0 - echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc - Disable kgdboc - echo "" > /sys/module/kgdboc/parameters/kgdboc - - NOTE: You do not need to specify the baud if you are - configuring the console on tty which is already configured or - open. - - - More examples - You can configure kgdboc to use the keyboard, and/or a serial device - depending on if you are using kdb and/or kgdb, in one of the - following scenarios. - - kdb and kgdb over only a serial port - kgdboc=<serial_device>[,baud] - Example: kgdboc=ttyS0,115200 - - kdb and kgdb with keyboard and a serial port - kgdboc=kbd,<serial_device>[,baud] - Example: kgdboc=kbd,ttyS0,115200 - - kdb with a keyboard - kgdboc=kbd - - kdb with kernel mode setting - kgdboc=kms,kbd - - kdb with kernel mode setting and kgdb over a serial port - kgdboc=kms,kbd,ttyS0,115200 - - - - NOTE: Kgdboc does not support interrupting the target via the - gdb remote protocol. You must manually send a sysrq-g unless you - have a proxy that splits console output to a terminal program. - A console proxy has a separate TCP port for the debugger and a separate - TCP port for the "human" console. The proxy can take care of sending - the sysrq-g for you. - - When using kgdboc with no debugger proxy, you can end up - connecting the debugger at one of two entry points. If an - exception occurs after you have loaded kgdboc, a message should - print on the console stating it is waiting for the debugger. In - this case you disconnect your terminal program and then connect the - debugger in its place. If you want to interrupt the target system - and forcibly enter a debug session you have to issue a Sysrq - sequence and then type the letter g. Then - you disconnect the terminal session and connect gdb. Your options - if you don't like this are to hack gdb to send the sysrq-g for you - as well as on the initial connect, or to use a debugger proxy that - allows an unmodified gdb to do the debugging. - - - - - - Kernel parameter: kgdbwait - - The Kernel command line option kgdbwait makes - kgdb wait for a debugger connection during booting of a kernel. You - can only use this option if you compiled a kgdb I/O driver into the - kernel and you specified the I/O driver configuration as a kernel - command line option. The kgdbwait parameter should always follow the - configuration parameter for the kgdb I/O driver in the kernel - command line else the I/O driver will not be configured prior to - asking the kernel to use it to wait. - - - The kernel will stop and wait as early as the I/O driver and - architecture allows when you use this option. If you build the - kgdb I/O driver as a loadable kernel module kgdbwait will not do - anything. - - - - Kernel parameter: kgdbcon - The kgdbcon feature allows you to see printk() messages - inside gdb while gdb is connected to the kernel. Kdb does not make - use of the kgdbcon feature. - - Kgdb supports using the gdb serial protocol to send console - messages to the debugger when the debugger is connected and running. - There are two ways to activate this feature. - - Activate with the kernel command line option: - kgdbcon - - Use sysfs before configuring an I/O driver - - echo 1 > /sys/module/kgdb/parameters/kgdb_use_con - - - NOTE: If you do this after you configure the kgdb I/O driver, the - setting will not take effect until the next point the I/O is - reconfigured. - - - - - IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an - active system console. An example of incorrect usage is console=ttyS0,115200 kgdboc=ttyS0 kgdbcon - - It is possible to use this option with kgdboc on a tty that is not a system console. - - - - Run time parameter: kgdbreboot - The kgdbreboot feature allows you to change how the debugger - deals with the reboot notification. You have 3 choices for the - behavior. The default behavior is always set to 0. - - echo -1 > /sys/module/debug_core/parameters/kgdbreboot - Ignore the reboot notification entirely. - - echo 0 > /sys/module/debug_core/parameters/kgdbreboot - Send the detach message to any attached debugger client. - - echo 1 > /sys/module/debug_core/parameters/kgdbreboot - Enter the debugger on reboot notify. - - - - - - Using kdb - - - - Quick start for kdb on a serial port - This is a quick example of how to use kdb. - - Configure kgdboc at boot using kernel parameters: - - console=ttyS0,115200 kgdboc=ttyS0,115200 - - OR - Configure kgdboc after the kernel has booted; assuming you are using a serial port console: - - echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc - - - - Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config. - - When logged in as root or with a super user session you can run: - echo g > /proc/sysrq-trigger - Example using minicom 2.2 - Press: Control-a - Press: f - Press: g - - When you have telneted to a terminal server that supports sending a remote break - Press: Control-] - Type in:send break - Press: Enter - Press: g - - - - From the kdb prompt you can run the "help" command to see a complete list of the commands that are available. - Some useful commands in kdb include: - - lsmod -- Shows where kernel modules are loaded - ps -- Displays only the active processes - ps A -- Shows all the processes - summary -- Shows kernel version info and memory usage - bt -- Get a backtrace of the current process using dump_stack() - dmesg -- View the kernel syslog buffer - go -- Continue the system - - - - - When you are done using kdb you need to consider rebooting the - system or using the "go" command to resuming normal kernel - execution. If you have paused the kernel for a lengthy period of - time, applications that rely on timely networking or anything to do - with real wall clock time could be adversely affected, so you - should take this into consideration when using the kernel - debugger. - - - - - Quick start for kdb using a keyboard connected console - This is a quick example of how to use kdb with a keyboard. - - Configure kgdboc at boot using kernel parameters: - - kgdboc=kbd - - OR - Configure kgdboc after the kernel has booted: - - echo kbd > /sys/module/kgdboc/parameters/kgdboc - - - - Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config. - - When logged in as root or with a super user session you can run: - echo g > /proc/sysrq-trigger - Example using a laptop keyboard - Press and hold down: Alt - Press and hold down: Fn - Press and release the key with the label: SysRq - Release: Fn - Press and release: g - Release: Alt - - Example using a PS/2 101-key keyboard - Press and hold down: Alt - Press and release the key with the label: SysRq - Press and release: g - Release: Alt - - - - - Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution. - - - - - - Using kgdb / gdb - In order to use kgdb you must activate it by passing - configuration information to one of the kgdb I/O drivers. If you - do not pass any configuration information kgdb will not do anything - at all. Kgdb will only actively hook up to the kernel trap hooks - if a kgdb I/O driver is loaded and configured. If you unconfigure - a kgdb I/O driver, kgdb will unregister all the kernel hook points. - - All kgdb I/O drivers can be reconfigured at run time, if - CONFIG_SYSFS and CONFIG_MODULES - are enabled, by echo'ing a new config string to - /sys/module/<driver>/parameter/<option>. - The driver can be unconfigured by passing an empty string. You cannot - change the configuration while the debugger is attached. Make sure - to detach the debugger with the detach command - prior to trying to unconfigure a kgdb I/O driver. - - - Connecting with gdb to a serial port - - Configure kgdboc - Configure kgdboc at boot using kernel parameters: - - kgdboc=ttyS0,115200 - - OR - Configure kgdboc after the kernel has booted: - - echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc - - - - Stop kernel execution (break into the debugger) - In order to connect to gdb via kgdboc, the kernel must - first be stopped. There are several ways to stop the kernel which - include using kgdbwait as a boot argument, via a sysrq-g, or running - the kernel until it takes an exception where it waits for the - debugger to attach. - - When logged in as root or with a super user session you can run: - echo g > /proc/sysrq-trigger - Example using minicom 2.2 - Press: Control-a - Press: f - Press: g - - When you have telneted to a terminal server that supports sending a remote break - Press: Control-] - Type in:send break - Press: Enter - Press: g - - - - - - Connect from gdb - - Example (using a directly connected port): - - - % gdb ./vmlinux - (gdb) set remotebaud 115200 - (gdb) target remote /dev/ttyS0 - - - Example (kgdb to a terminal server on TCP port 2012): - - - % gdb ./vmlinux - (gdb) target remote 192.168.2.2:2012 - - - Once connected, you can debug a kernel the way you would debug an - application program. - - - If you are having problems connecting or something is going - seriously wrong while debugging, it will most often be the case - that you want to enable gdb to be verbose about its target - communications. You do this prior to issuing the target - remote command by typing in: set debug remote 1 - - - - Remember if you continue in gdb, and need to "break in" again, - you need to issue an other sysrq-g. It is easy to create a simple - entry point by putting a breakpoint at sys_sync - and then you can run "sync" from a shell or script to break into the - debugger. - - - - kgdb and kdb interoperability - It is possible to transition between kdb and kgdb dynamically. - The debug core will remember which you used the last time and - automatically start in the same mode. - - Switching between kdb and kgdb - - Switching from kgdb to kdb - - There are two ways to switch from kgdb to kdb: you can use gdb to - issue a maintenance packet, or you can blindly type the command $3#33. - Whenever the kernel debugger stops in kgdb mode it will print the - message KGDB or $3#33 for KDB. It is important - to note that you have to type the sequence correctly in one pass. - You cannot type a backspace or delete because kgdb will interpret - that as part of the debug stream. - - Change from kgdb to kdb by blindly typing: - $3#33 - Change from kgdb to kdb with gdb - maintenance packet 3 - NOTE: Now you must kill gdb. Typically you press control-z and - issue the command: kill -9 % - - - - - Change from kdb to kgdb - There are two ways you can change from kdb to kgdb. You can - manually enter kgdb mode by issuing the kgdb command from the kdb - shell prompt, or you can connect gdb while the kdb shell prompt is - active. The kdb shell looks for the typical first commands that gdb - would issue with the gdb remote protocol and if it sees one of those - commands it automatically changes into kgdb mode. - - From kdb issue the command: - kgdb - Now disconnect your terminal program and connect gdb in its place - At the kdb prompt, disconnect the terminal program and connect gdb in its place. - - - - - Running kdb commands from gdb - It is possible to run a limited set of kdb commands from gdb, - using the gdb monitor command. You don't want to execute any of the - run control or breakpoint operations, because it can disrupt the - state of the kernel debugger. You should be using gdb for - breakpoints and run control operations if you have gdb connected. - The more useful commands to run are things like lsmod, dmesg, ps or - possibly some of the memory information commands. To see all the kdb - commands you can run monitor help. - Example: - -(gdb) monitor ps -1 idle process (state I) and -27 sleeping system daemon (state M) processes suppressed, -use 'ps A' to see all. -Task Addr Pid Parent [*] cpu State Thread Command - -0xc78291d0 1 0 0 0 S 0xc7829404 init -0xc7954150 942 1 0 0 S 0xc7954384 dropbear -0xc78789c0 944 1 0 0 S 0xc7878bf4 sh -(gdb) - - - - - - kgdb Test Suite - - When kgdb is enabled in the kernel config you can also elect to - enable the config parameter KGDB_TESTS. Turning this on will - enable a special kgdb I/O module which is designed to test the - kgdb internal functions. - - - The kgdb tests are mainly intended for developers to test the kgdb - internals as well as a tool for developing a new kgdb architecture - specific implementation. These tests are not really for end users - of the Linux kernel. The primary source of documentation would be - to look in the drivers/misc/kgdbts.c file. - - - The kgdb test suite can also be configured at compile time to run - the core set of tests by setting the kernel config parameter - KGDB_TESTS_ON_BOOT. This particular option is aimed at automated - regression testing and does not require modifying the kernel boot - config arguments. If this is turned on, the kgdb test suite can - be disabled by specifying "kgdbts=" as a kernel boot argument. - - - - Kernel Debugger Internals - - Architecture Specifics - - The kernel debugger is organized into a number of components: - - The debug core - - The debug core is found in kernel/debugger/debug_core.c. It contains: - - A generic OS exception handler which includes - sync'ing the processors into a stopped state on an multi-CPU - system. - The API to talk to the kgdb I/O drivers - The API to make calls to the arch-specific kgdb implementation - The logic to perform safe memory reads and writes to memory while using the debugger - A full implementation for software breakpoints unless overridden by the arch - The API to invoke either the kdb or kgdb frontend to the debug core. - The structures and callback API for atomic kernel mode setting. - NOTE: kgdboc is where the kms callbacks are invoked. - - - - kgdb arch-specific implementation - - This implementation is generally found in arch/*/kernel/kgdb.c. - As an example, arch/x86/kernel/kgdb.c contains the specifics to - implement HW breakpoint as well as the initialization to - dynamically register and unregister for the trap handlers on - this architecture. The arch-specific portion implements: - - contains an arch-specific trap catcher which - invokes kgdb_handle_exception() to start kgdb about doing its - work - translation to and from gdb specific packet format to pt_regs - Registration and unregistration of architecture specific trap hooks - Any special exception handling and cleanup - NMI exception handling and cleanup - (optional) HW breakpoints - - - - gdbstub frontend (aka kgdb) - The gdbstub is located in kernel/debug/gdbstub.c. It contains: - - All the logic to implement the gdb serial protocol - - - kdb frontend - The kdb debugger shell is broken down into a number of - components. The kdb core is located in kernel/debug/kdb. There - are a number of helper functions in some of the other kernel - components to make it possible for kdb to examine and report - information about the kernel without taking locks that could - cause a kernel deadlock. The kdb core contains implements the following functionality. - - A simple shell - The kdb core command set - A registration API to register additional kdb shell commands. - - A good example of a self-contained kdb module - is the "ftdump" command for dumping the ftrace buffer. See: - kernel/trace/trace_kdb.c - For an example of how to dynamically register - a new kdb command you can build the kdb_hello.ko kernel module - from samples/kdb/kdb_hello.c. To build this example you can - set CONFIG_SAMPLES=y and CONFIG_SAMPLE_KDB=m in your kernel - config. Later run "modprobe kdb_hello" and the next time you - enter the kdb shell, you can run the "hello" - command. - - The implementation for kdb_printf() which - emits messages directly to I/O drivers, bypassing the kernel - log. - SW / HW breakpoint management for the kdb shell - - - kgdb I/O driver - - Each kgdb I/O driver has to provide an implementation for the following: - - configuration via built-in or module - dynamic configuration and kgdb hook registration calls - read and write character interface - A cleanup handler for unconfiguring from the kgdb core - (optional) Early debug methodology - - Any given kgdb I/O driver has to operate very closely with the - hardware and must do it in such a way that does not enable - interrupts or change other parts of the system context without - completely restoring them. The kgdb core will repeatedly "poll" - a kgdb I/O driver for characters when it needs input. The I/O - driver is expected to return immediately if there is no data - available. Doing so allows for the future possibility to touch - watchdog hardware in such a way as to have a target system not - reset when these are enabled. - - - - - - If you are intent on adding kgdb architecture specific support - for a new architecture, the architecture should define - HAVE_ARCH_KGDB in the architecture specific - Kconfig file. This will enable kgdb for the architecture, and - at that point you must create an architecture specific kgdb - implementation. - - - There are a few flags which must be set on every architecture in - their <asm/kgdb.h> file. These are: - - - - NUMREGBYTES: The size in bytes of all of the registers, so - that we can ensure they will all fit into a packet. - - - - - BUFMAX: The size in bytes of the buffer GDB will read into. - This must be larger than NUMREGBYTES. - - - - - CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call - flush_cache_range or flush_icache_range. On some architectures, - these functions may not be safe to call on SMP since we keep other - CPUs in a holding pattern. - - - - - - There are also the following functions for the common backend, - found in kernel/kgdb.c, that must be supplied by the - architecture-specific backend unless marked as (optional), in - which case a default function maybe used if the architecture - does not need to provide a specific implementation. - -!Iinclude/linux/kgdb.h - - - kgdboc internals - - kgdboc and uarts - - The kgdboc driver is actually a very thin driver that relies on the - underlying low level to the hardware driver having "polling hooks" - to which the tty driver is attached. In the initial - implementation of kgdboc the serial_core was changed to expose a - low level UART hook for doing polled mode reading and writing of a - single character while in an atomic context. When kgdb makes an I/O - request to the debugger, kgdboc invokes a callback in the serial - core which in turn uses the callback in the UART driver. - - When using kgdboc with a UART, the UART driver must implement two callbacks in the struct uart_ops. Example from drivers/8250.c: -#ifdef CONFIG_CONSOLE_POLL - .poll_get_char = serial8250_get_poll_char, - .poll_put_char = serial8250_put_poll_char, -#endif - - Any implementation specifics around creating a polling driver use the - #ifdef CONFIG_CONSOLE_POLL, as shown above. - Keep in mind that polling hooks have to be implemented in such a way - that they can be called from an atomic context and have to restore - the state of the UART chip on return such that the system can return - to normal when the debugger detaches. You need to be very careful - with any kind of lock you consider, because failing here is most likely - going to mean pressing the reset button. - - - - kgdboc and keyboards - The kgdboc driver contains logic to configure communications - with an attached keyboard. The keyboard infrastructure is only - compiled into the kernel when CONFIG_KDB_KEYBOARD=y is set in the - kernel configuration. - The core polled keyboard driver driver for PS/2 type keyboards - is in drivers/char/kdb_keyboard.c. This driver is hooked into the - debug core when kgdboc populates the callback in the array - called kdb_poll_funcs[]. The - kdb_get_kbd_char() is the top-level function which polls hardware - for single character input. - - - - kgdboc and kms - The kgdboc driver contains logic to request the graphics - display to switch to a text context when you are using - "kgdboc=kms,kbd", provided that you have a video driver which has a - frame buffer console and atomic kernel mode setting support. - - Every time the kernel - debugger is entered it calls kgdboc_pre_exp_handler() which in turn - calls con_debug_enter() in the virtual console layer. On resuming kernel - execution, the kernel debugger calls kgdboc_post_exp_handler() which - in turn calls con_debug_leave(). - Any video driver that wants to be compatible with the kernel - debugger and the atomic kms callbacks must implement the - mode_set_base_atomic, fb_debug_enter and fb_debug_leave operations. - For the fb_debug_enter and fb_debug_leave the option exists to use - the generic drm fb helper functions or implement something custom for - the hardware. The following example shows the initialization of the - .mode_set_base_atomic operation in - drivers/gpu/drm/i915/intel_display.c: - - -static const struct drm_crtc_helper_funcs intel_helper_funcs = { -[...] - .mode_set_base_atomic = intel_pipe_set_base_atomic, -[...] -}; - - - - Here is an example of how the i915 driver initializes the fb_debug_enter and fb_debug_leave functions to use the generic drm helpers in - drivers/gpu/drm/i915/intel_fb.c: - - -static struct fb_ops intelfb_ops = { -[...] - .fb_debug_enter = drm_fb_helper_debug_enter, - .fb_debug_leave = drm_fb_helper_debug_leave, -[...] -}; - - - - - - - - Credits - - The following people have contributed to this document: - - Amit Kaleamitkale@linsyssoft.com - Tom Rinitrini@kernel.crashing.org - - In March 2008 this document was completely rewritten by: - - Jason Wesseljason.wessel@windriver.com - - In Jan 2010 this document was updated to include kdb. - - Jason Wesseljason.wessel@windriver.com - - - -
- diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl deleted file mode 100644 index 0320910b866d..000000000000 --- a/Documentation/DocBook/libata.tmpl +++ /dev/null @@ -1,1625 +0,0 @@ - - - - - - libATA Developer's Guide - - - - Jeff - Garzik - - - - - 2003-2006 - Jeff Garzik - - - - - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - http://fedoraproject.org/wiki/Licensing:OSL1.1 - and is included herein by reference. - - - - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - - - - - - - - - Introduction - - libATA is a library used inside the Linux kernel to support ATA host - controllers and devices. libATA provides an ATA driver API, class - transports for ATA and ATAPI devices, and SCSI<->ATA translation - for ATA devices according to the T10 SAT specification. - - - This Guide documents the libATA driver API, library functions, library - internals, and a couple sample ATA low-level drivers. - - - - - libata Driver API - - struct ata_port_operations is defined for every low-level libata - hardware driver, and it controls how the low-level driver - interfaces with the ATA and SCSI layers. - - - FIS-based drivers will hook into the system with ->qc_prep() and - ->qc_issue() high-level hooks. Hardware which behaves in a manner - similar to PCI IDE hardware may utilize several generic helpers, - defining at a bare minimum the bus I/O addresses of the ATA shadow - register blocks. - - - struct ata_port_operations - - Disable ATA port - -void (*port_disable) (struct ata_port *); - - - - Called from ata_bus_probe() error path, as well as when - unregistering from the SCSI module (rmmod, hot unplug). - This function should do whatever needs to be done to take the - port out of use. In most cases, ata_port_disable() can be used - as this hook. - - - Called from ata_bus_probe() on a failed probe. - Called from ata_scsi_release(). - - - - - Post-IDENTIFY device configuration - -void (*dev_config) (struct ata_port *, struct ata_device *); - - - - Called after IDENTIFY [PACKET] DEVICE is issued to each device - found. Typically used to apply device-specific fixups prior to - issue of SET FEATURES - XFER MODE, and prior to operation. - - - This entry may be specified as NULL in ata_port_operations. - - - - - Set PIO/DMA mode - -void (*set_piomode) (struct ata_port *, struct ata_device *); -void (*set_dmamode) (struct ata_port *, struct ata_device *); -void (*post_set_mode) (struct ata_port *); -unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int); - - - - Hooks called prior to the issue of SET FEATURES - XFER MODE - command. The optional ->mode_filter() hook is called when libata - has built a mask of the possible modes. This is passed to the - ->mode_filter() function which should return a mask of valid modes - after filtering those unsuitable due to hardware limits. It is not - valid to use this interface to add modes. - - - dev->pio_mode and dev->dma_mode are guaranteed to be valid when - ->set_piomode() and when ->set_dmamode() is called. The timings for - any other drive sharing the cable will also be valid at this point. - That is the library records the decisions for the modes of each - drive on a channel before it attempts to set any of them. - - - ->post_set_mode() is - called unconditionally, after the SET FEATURES - XFER MODE - command completes successfully. - - - - ->set_piomode() is always called (if present), but - ->set_dma_mode() is only called if DMA is possible. - - - - - Taskfile read/write - -void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf); -void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf); - - - - ->tf_load() is called to load the given taskfile into hardware - registers / DMA buffers. ->tf_read() is called to read the - hardware registers / DMA buffers, to obtain the current set of - taskfile register values. - Most drivers for taskfile-based hardware (PIO or MMIO) use - ata_sff_tf_load() and ata_sff_tf_read() for these hooks. - - - - - PIO data read/write - -void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); - - - -All bmdma-style drivers must implement this hook. This is the low-level -operation that actually copies the data bytes during a PIO data -transfer. -Typically the driver will choose one of ata_sff_data_xfer_noirq(), -ata_sff_data_xfer(), or ata_sff_data_xfer32(). - - - - - ATA command execute - -void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf); - - - - causes an ATA command, previously loaded with - ->tf_load(), to be initiated in hardware. - Most drivers for taskfile-based hardware use ata_sff_exec_command() - for this hook. - - - - - Per-cmd ATAPI DMA capabilities filter - -int (*check_atapi_dma) (struct ata_queued_cmd *qc); - - - -Allow low-level driver to filter ATA PACKET commands, returning a status -indicating whether or not it is OK to use DMA for the supplied PACKET -command. - - - This hook may be specified as NULL, in which case libata will - assume that atapi dma can be supported. - - - - - Read specific ATA shadow registers - -u8 (*sff_check_status)(struct ata_port *ap); -u8 (*sff_check_altstatus)(struct ata_port *ap); - - - - Reads the Status/AltStatus ATA shadow register from - hardware. On some hardware, reading the Status register has - the side effect of clearing the interrupt condition. - Most drivers for taskfile-based hardware use - ata_sff_check_status() for this hook. - - - - - Write specific ATA shadow register - -void (*sff_set_devctl)(struct ata_port *ap, u8 ctl); - - - - Write the device control ATA shadow register to the hardware. - Most drivers don't need to define this. - - - - - Select ATA device on bus - -void (*sff_dev_select)(struct ata_port *ap, unsigned int device); - - - - Issues the low-level hardware command(s) that causes one of N - hardware devices to be considered 'selected' (active and - available for use) on the ATA bus. This generally has no - meaning on FIS-based devices. - - - Most drivers for taskfile-based hardware use - ata_sff_dev_select() for this hook. - - - - - Private tuning method - -void (*set_mode) (struct ata_port *ap); - - - - By default libata performs drive and controller tuning in - accordance with the ATA timing rules and also applies blacklists - and cable limits. Some controllers need special handling and have - custom tuning rules, typically raid controllers that use ATA - commands but do not actually do drive timing. - - - - - This hook should not be used to replace the standard controller - tuning logic when a controller has quirks. Replacing the default - tuning logic in that case would bypass handling for drive and - bridge quirks that may be important to data reliability. If a - controller needs to filter the mode selection it should use the - mode_filter hook instead. - - - - - - Control PCI IDE BMDMA engine - -void (*bmdma_setup) (struct ata_queued_cmd *qc); -void (*bmdma_start) (struct ata_queued_cmd *qc); -void (*bmdma_stop) (struct ata_port *ap); -u8 (*bmdma_status) (struct ata_port *ap); - - - -When setting up an IDE BMDMA transaction, these hooks arm -(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop) -the hardware's DMA engine. ->bmdma_status is used to read the standard -PCI IDE DMA Status register. - - - -These hooks are typically either no-ops, or simply not implemented, in -FIS-based drivers. - - -Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup() -hook. ata_bmdma_setup() will write the pointer to the PRD table to -the IDE PRD Table Address register, enable DMA in the DMA Command -register, and call exec_command() to begin the transfer. - - -Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start() -hook. ata_bmdma_start() will write the ATA_DMA_START flag to the DMA -Command register. - - -Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop() -hook. ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA -command register. - - -Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook. - - - - - High-level taskfile hooks - -void (*qc_prep) (struct ata_queued_cmd *qc); -int (*qc_issue) (struct ata_queued_cmd *qc); - - - - Higher-level hooks, these two hooks can potentially supercede - several of the above taskfile/DMA engine hooks. ->qc_prep is - called after the buffers have been DMA-mapped, and is typically - used to populate the hardware's DMA scatter-gather table. - Most drivers use the standard ata_qc_prep() helper function, but - more advanced drivers roll their own. - - - ->qc_issue is used to make a command active, once the hardware - and S/G tables have been prepared. IDE BMDMA drivers use the - helper function ata_qc_issue_prot() for taskfile protocol-based - dispatch. More advanced drivers implement their own ->qc_issue. - - - ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and - ->bmdma_start() as necessary to initiate a transfer. - - - - - Exception and probe handling (EH) - -void (*eng_timeout) (struct ata_port *ap); -void (*phy_reset) (struct ata_port *ap); - - - -Deprecated. Use ->error_handler() instead. - - - -void (*freeze) (struct ata_port *ap); -void (*thaw) (struct ata_port *ap); - - - -ata_port_freeze() is called when HSM violations or some other -condition disrupts normal operation of the port. A frozen port -is not allowed to perform any operation until the port is -thawed, which usually follows a successful reset. - - - -The optional ->freeze() callback can be used for freezing the port -hardware-wise (e.g. mask interrupt and stop DMA engine). If a -port cannot be frozen hardware-wise, the interrupt handler -must ack and clear interrupts unconditionally while the port -is frozen. - - -The optional ->thaw() callback is called to perform the opposite of ->freeze(): -prepare the port for normal operation once again. Unmask interrupts, -start DMA engine, etc. - - - -void (*error_handler) (struct ata_port *ap); - - - -->error_handler() is a driver's hook into probe, hotplug, and recovery -and other exceptional conditions. The primary responsibility of an -implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set -of EH hooks as arguments: - - - -'prereset' hook (may be NULL) is called during an EH reset, before any other actions -are taken. - - - -'postreset' hook (may be NULL) is called after the EH reset is performed. Based on -existing conditions, severity of the problem, and hardware capabilities, - - - -Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be -called to perform the low-level EH reset. - - - -void (*post_internal_cmd) (struct ata_queued_cmd *qc); - - - -Perform any hardware-specific actions necessary to finish processing -after executing a probe-time or EH-time command via ata_exec_internal(). - - - - - Hardware interrupt handling - -irqreturn_t (*irq_handler)(int, void *, struct pt_regs *); -void (*irq_clear) (struct ata_port *); - - - - ->irq_handler is the interrupt handling routine registered with - the system, by libata. ->irq_clear is called during probe just - before the interrupt handler is registered, to be sure hardware - is quiet. - - - The second argument, dev_instance, should be cast to a pointer - to struct ata_host_set. - - - Most legacy IDE drivers use ata_sff_interrupt() for the - irq_handler hook, which scans all ports in the host_set, - determines which queued command was active (if any), and calls - ata_sff_host_intr(ap,qc). - - - Most legacy IDE drivers use ata_sff_irq_clear() for the - irq_clear() hook, which simply clears the interrupt and error - flags in the DMA status register. - - - - - SATA phy read/write - -int (*scr_read) (struct ata_port *ap, unsigned int sc_reg, - u32 *val); -int (*scr_write) (struct ata_port *ap, unsigned int sc_reg, - u32 val); - - - - Read and write standard SATA phy registers. Currently only used - if ->phy_reset hook called the sata_phy_reset() helper function. - sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE. - - - - - Init and shutdown - -int (*port_start) (struct ata_port *ap); -void (*port_stop) (struct ata_port *ap); -void (*host_stop) (struct ata_host_set *host_set); - - - - ->port_start() is called just after the data structures for each - port are initialized. Typically this is used to alloc per-port - DMA buffers / tables / rings, enable DMA engines, and similar - tasks. Some drivers also use this entry point as a chance to - allocate driver-private memory for ap->private_data. - - - Many drivers use ata_port_start() as this hook or call - it from their own port_start() hooks. ata_port_start() - allocates space for a legacy IDE PRD table and returns. - - - ->port_stop() is called after ->host_stop(). Its sole function - is to release DMA/memory resources, now that they are no longer - actively being used. Many drivers also free driver-private - data from port at this time. - - - ->host_stop() is called after all ->port_stop() calls -have completed. The hook must finalize hardware shutdown, release DMA -and other resources, etc. - This hook may be specified as NULL, in which case it is not called. - - - - - - - - - Error handling - - - This chapter describes how errors are handled under libata. - Readers are advised to read SCSI EH - (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first. - - - Origins of commands - - In libata, a command is represented with struct ata_queued_cmd - or qc. qc's are preallocated during port initialization and - repetitively used for command executions. Currently only one - qc is allocated per port but yet-to-be-merged NCQ branch - allocates one for each tag and maps each qc to NCQ tag 1-to-1. - - - libata commands can originate from two sources - libata itself - and SCSI midlayer. libata internal commands are used for - initialization and error handling. All normal blk requests - and commands for SCSI emulation are passed as SCSI commands - through queuecommand callback of SCSI host template. - - - - How commands are issued - - - - Internal commands - - - First, qc is allocated and initialized using - ata_qc_new_init(). Although ata_qc_new_init() doesn't - implement any wait or retry mechanism when qc is not - available, internal commands are currently issued only during - initialization and error recovery, so no other command is - active and allocation is guaranteed to succeed. - - - Once allocated qc's taskfile is initialized for the command to - be executed. qc currently has two mechanisms to notify - completion. One is via qc->complete_fn() callback and the - other is completion qc->waiting. qc->complete_fn() callback - is the asynchronous path used by normal SCSI translated - commands and qc->waiting is the synchronous (issuer sleeps in - process context) path used by internal commands. - - - Once initialization is complete, host_set lock is acquired - and the qc is issued. - - - - - SCSI commands - - - All libata drivers use ata_scsi_queuecmd() as - hostt->queuecommand callback. scmds can either be simulated - or translated. No qc is involved in processing a simulated - scmd. The result is computed right away and the scmd is - completed. - - - For a translated scmd, ata_qc_new_init() is invoked to - allocate a qc and the scmd is translated into the qc. SCSI - midlayer's completion notification function pointer is stored - into qc->scsidone. - - - qc->complete_fn() callback is used for completion - notification. ATA commands use ata_scsi_qc_complete() while - ATAPI commands use atapi_qc_complete(). Both functions end up - calling qc->scsidone to notify upper layer when the qc is - finished. After translation is completed, the qc is issued - with ata_qc_issue(). - - - Note that SCSI midlayer invokes hostt->queuecommand while - holding host_set lock, so all above occur while holding - host_set lock. - - - - - - - - How commands are processed - - Depending on which protocol and which controller are used, - commands are processed differently. For the purpose of - discussion, a controller which uses taskfile interface and all - standard callbacks is assumed. - - - Currently 6 ATA command protocols are used. They can be - sorted into the following four categories according to how - they are processed. - - - - ATA NO DATA or DMA - - - ATA_PROT_NODATA and ATA_PROT_DMA fall into this category. - These types of commands don't require any software - intervention once issued. Device will raise interrupt on - completion. - - - - - ATA PIO - - - ATA_PROT_PIO is in this category. libata currently - implements PIO with polling. ATA_NIEN bit is set to turn - off interrupt and pio_task on ata_wq performs polling and - IO. - - - - - ATAPI NODATA or DMA - - - ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this - category. packet_task is used to poll BSY bit after - issuing PACKET command. Once BSY is turned off by the - device, packet_task transfers CDB and hands off processing - to interrupt handler. - - - - - ATAPI PIO - - - ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set - and, as in ATAPI NODATA or DMA, packet_task submits cdb. - However, after submitting cdb, further processing (data - transfer) is handed off to pio_task. - - - - - - - How commands are completed - - Once issued, all qc's are either completed with - ata_qc_complete() or time out. For commands which are handled - by interrupts, ata_host_intr() invokes ata_qc_complete(), and, - for PIO tasks, pio_task invokes ata_qc_complete(). In error - cases, packet_task may also complete commands. - - - ata_qc_complete() does the following. - - - - - - - DMA memory is unmapped. - - - - - - ATA_QCFLAG_ACTIVE is cleared from qc->flags. - - - - - - qc->complete_fn() callback is invoked. If the return value of - the callback is not zero. Completion is short circuited and - ata_qc_complete() returns. - - - - - - __ata_qc_complete() is called, which does - - - - - qc->flags is cleared to zero. - - - - - - ap->active_tag and qc->tag are poisoned. - - - - - - qc->waiting is cleared & completed (in that order). - - - - - - qc is deallocated by clearing appropriate bit in ap->qactive. - - - - - - - - - - - So, it basically notifies upper layer and deallocates qc. One - exception is short-circuit path in #3 which is used by - atapi_qc_complete(). - - - For all non-ATAPI commands, whether it fails or not, almost - the same code path is taken and very little error handling - takes place. A qc is completed with success status if it - succeeded, with failed status otherwise. - - - However, failed ATAPI commands require more handling as - REQUEST SENSE is needed to acquire sense data. If an ATAPI - command fails, ata_qc_complete() is invoked with error status, - which in turn invokes atapi_qc_complete() via - qc->complete_fn() callback. - - - This makes atapi_qc_complete() set scmd->result to - SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As - the sense data is empty but scmd->result is CHECK CONDITION, - SCSI midlayer will invoke EH for the scmd, and returning 1 - makes ata_qc_complete() to return without deallocating the qc. - This leads us to ata_scsi_error() with partially completed qc. - - - - - ata_scsi_error() - - ata_scsi_error() is the current transportt->eh_strategy_handler() - for libata. As discussed above, this will be entered in two - cases - timeout and ATAPI error completion. This function - calls low level libata driver's eng_timeout() callback, the - standard callback for which is ata_eng_timeout(). It checks - if a qc is active and calls ata_qc_timeout() on the qc if so. - Actual error handling occurs in ata_qc_timeout(). - - - If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and - completes the qc. Note that as we're currently in EH, we - cannot call scsi_done. As described in SCSI EH doc, a - recovered scmd should be either retried with - scsi_queue_insert() or finished with scsi_finish_command(). - Here, we override qc->scsidone with scsi_finish_command() and - calls ata_qc_complete(). - - - If EH is invoked due to a failed ATAPI qc, the qc here is - completed but not deallocated. The purpose of this - half-completion is to use the qc as place holder to make EH - code reach this place. This is a bit hackish, but it works. - - - Once control reaches here, the qc is deallocated by invoking - __ata_qc_complete() explicitly. Then, internal qc for REQUEST - SENSE is issued. Once sense data is acquired, scmd is - finished by directly invoking scsi_finish_command() on the - scmd. Note that as we already have completed and deallocated - the qc which was associated with the scmd, we don't need - to/cannot call ata_qc_complete() again. - - - - - Problems with the current EH - - - - - - Error representation is too crude. Currently any and all - error conditions are represented with ATA STATUS and ERROR - registers. Errors which aren't ATA device errors are treated - as ATA device errors by setting ATA_ERR bit. Better error - descriptor which can properly represent ATA and other - errors/exceptions is needed. - - - - - - When handling timeouts, no action is taken to make device - forget about the timed out command and ready for new commands. - - - - - - EH handling via ata_scsi_error() is not properly protected - from usual command processing. On EH entrance, the device is - not in quiescent state. Timed out commands may succeed or - fail any time. pio_task and atapi_task may still be running. - - - - - - Too weak error recovery. Devices / controllers causing HSM - mismatch errors and other errors quite often require reset to - return to known state. Also, advanced error handling is - necessary to support features like NCQ and hotplug. - - - - - - ATA errors are directly handled in the interrupt handler and - PIO errors in pio_task. This is problematic for advanced - error handling for the following reasons. - - - First, advanced error handling often requires context and - internal qc execution. - - - Second, even a simple failure (say, CRC error) needs - information gathering and could trigger complex error handling - (say, resetting & reconfiguring). Having multiple code - paths to gather information, enter EH and trigger actions - makes life painful. - - - Third, scattered EH code makes implementing low level drivers - difficult. Low level drivers override libata callbacks. If - EH is scattered over several places, each affected callbacks - should perform its part of error handling. This can be error - prone and painful. - - - - - - - - - libata Library -!Edrivers/ata/libata-core.c - - - - libata Core Internals -!Idrivers/ata/libata-core.c - - - - libata SCSI translation/emulation -!Edrivers/ata/libata-scsi.c -!Idrivers/ata/libata-scsi.c - - - - ATA errors and exceptions - - - This chapter tries to identify what error/exception conditions exist - for ATA/ATAPI devices and describe how they should be handled in - implementation-neutral way. - - - - The term 'error' is used to describe conditions where either an - explicit error condition is reported from device or a command has - timed out. - - - - The term 'exception' is either used to describe exceptional - conditions which are not errors (say, power or hotplug events), or - to describe both errors and non-error exceptional conditions. Where - explicit distinction between error and exception is necessary, the - term 'non-error exception' is used. - - - - Exception categories - - Exceptions are described primarily with respect to legacy - taskfile + bus master IDE interface. If a controller provides - other better mechanism for error reporting, mapping those into - categories described below shouldn't be difficult. - - - - In the following sections, two recovery actions - reset and - reconfiguring transport - are mentioned. These are described - further in . - - - - HSM violation - - This error is indicated when STATUS value doesn't match HSM - requirement during issuing or execution any ATA/ATAPI command. - - - - Examples - - - - ATA_STATUS doesn't contain !BSY && DRDY && !DRQ while trying - to issue a command. - - - - - - !BSY && !DRQ during PIO data transfer. - - - - - - DRQ on command completion. - - - - - - !BSY && ERR after CDB transfer starts but before the - last byte of CDB is transferred. ATA/ATAPI standard states - that "The device shall not terminate the PACKET command - with an error before the last byte of the command packet has - been written" in the error outputs description of PACKET - command and the state diagram doesn't include such - transitions. - - - - - - - In these cases, HSM is violated and not much information - regarding the error can be acquired from STATUS or ERROR - register. IOW, this error can be anything - driver bug, - faulty device, controller and/or cable. - - - - As HSM is violated, reset is necessary to restore known state. - Reconfiguring transport for lower speed might be helpful too - as transmission errors sometimes cause this kind of errors. - - - - - ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION) - - - These are errors detected and reported by ATA/ATAPI devices - indicating device problems. For this type of errors, STATUS - and ERROR register values are valid and describe error - condition. Note that some of ATA bus errors are detected by - ATA/ATAPI devices and reported using the same mechanism as - device errors. Those cases are described later in this - section. - - - - For ATA commands, this type of errors are indicated by !BSY - && ERR during command execution and on completion. - - - For ATAPI commands, - - - - - - !BSY && ERR && ABRT right after issuing PACKET - indicates that PACKET command is not supported and falls in - this category. - - - - - - !BSY && ERR(==CHK) && !ABRT after the last - byte of CDB is transferred indicates CHECK CONDITION and - doesn't fall in this category. - - - - - - !BSY && ERR(==CHK) && ABRT after the last byte - of CDB is transferred *probably* indicates CHECK CONDITION and - doesn't fall in this category. - - - - - - - Of errors detected as above, the following are not ATA/ATAPI - device errors but ATA bus errors and should be handled - according to . - - - - - - CRC error during data transfer - - - This is indicated by ICRC bit in the ERROR register and - means that corruption occurred during data transfer. Up to - ATA/ATAPI-7, the standard specifies that this bit is only - applicable to UDMA transfers but ATA/ATAPI-8 draft revision - 1f says that the bit may be applicable to multiword DMA and - PIO. - - - - - - ABRT error during data transfer or on completion - - - Up to ATA/ATAPI-7, the standard specifies that ABRT could be - set on ICRC errors and on cases where a device is not able - to complete a command. Combined with the fact that MWDMA - and PIO transfer errors aren't allowed to use ICRC bit up to - ATA/ATAPI-7, it seems to imply that ABRT bit alone could - indicate transfer errors. - - - However, ATA/ATAPI-8 draft revision 1f removes the part - that ICRC errors can turn on ABRT. So, this is kind of - gray area. Some heuristics are needed here. - - - - - - - - ATA/ATAPI device errors can be further categorized as follows. - - - - - - Media errors - - - This is indicated by UNC bit in the ERROR register. ATA - devices reports UNC error only after certain number of - retries cannot recover the data, so there's nothing much - else to do other than notifying upper layer. - - - READ and WRITE commands report CHS or LBA of the first - failed sector but ATA/ATAPI standard specifies that the - amount of transferred data on error completion is - indeterminate, so we cannot assume that sectors preceding - the failed sector have been transferred and thus cannot - complete those sectors successfully as SCSI does. - - - - - - Media changed / media change requested error - - - <<TODO: fill here>> - - - - - Address error - - - This is indicated by IDNF bit in the ERROR register. - Report to upper layer. - - - - - Other errors - - - This can be invalid command or parameter indicated by ABRT - ERROR bit or some other error condition. Note that ABRT - bit can indicate a lot of things including ICRC and Address - errors. Heuristics needed. - - - - - - - - Depending on commands, not all STATUS/ERROR bits are - applicable. These non-applicable bits are marked with - "na" in the output descriptions but up to ATA/ATAPI-7 - no definition of "na" can be found. However, - ATA/ATAPI-8 draft revision 1f describes "N/A" as - follows. - - -
- - 3.2.3.3a N/A - - - A keyword the indicates a field has no defined value in - this standard and should not be checked by the host or - device. N/A fields should be cleared to zero. - - - - -
- - - So, it seems reasonable to assume that "na" bits are - cleared to zero by devices and thus need no explicit masking. - - -
- - - ATAPI device CHECK CONDITION - - - ATAPI device CHECK CONDITION error is indicated by set CHK bit - (ERR bit) in the STATUS register after the last byte of CDB is - transferred for a PACKET command. For this kind of errors, - sense data should be acquired to gather information regarding - the errors. REQUEST SENSE packet command should be used to - acquire sense data. - - - - Once sense data is acquired, this type of errors can be - handled similarly to other SCSI errors. Note that sense data - may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR - && ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such - cases, the error should be considered as an ATA bus error and - handled according to . - - - - - - ATA device error (NCQ) - - - NCQ command error is indicated by cleared BSY and set ERR bit - during NCQ command phase (one or more NCQ commands - outstanding). Although STATUS and ERROR registers will - contain valid values describing the error, READ LOG EXT is - required to clear the error condition, determine which command - has failed and acquire more information. - - - - READ LOG EXT Log Page 10h reports which tag has failed and - taskfile register values describing the error. With this - information the failed command can be handled as a normal ATA - command error as in and all - other in-flight commands must be retried. Note that this - retry should not be counted - it's likely that commands - retried this way would have completed normally if it were not - for the failed command. - - - - Note that ATA bus errors can be reported as ATA device NCQ - errors. This should be handled as described in . - - - - If READ LOG EXT Log Page 10h fails or reports NQ, we're - thoroughly screwed. This condition should be treated - according to . - - - - - - ATA bus error - - - ATA bus error means that data corruption occurred during - transmission over ATA bus (SATA or PATA). This type of errors - can be indicated by - - - - - - - ICRC or ABRT error as described in . - - - - - - Controller-specific error completion with error information - indicating transmission error. - - - - - - On some controllers, command timeout. In this case, there may - be a mechanism to determine that the timeout is due to - transmission error. - - - - - - Unknown/random errors, timeouts and all sorts of weirdities. - - - - - - - As described above, transmission errors can cause wide variety - of symptoms ranging from device ICRC error to random device - lockup, and, for many cases, there is no way to tell if an - error condition is due to transmission error or not; - therefore, it's necessary to employ some kind of heuristic - when dealing with errors and timeouts. For example, - encountering repetitive ABRT errors for known supported - command is likely to indicate ATA bus error. - - - - Once it's determined that ATA bus errors have possibly - occurred, lowering ATA bus transmission speed is one of - actions which may alleviate the problem. See for more information. - - - - - - PCI bus error - - - Data corruption or other failures during transmission over PCI - (or other system bus). For standard BMDMA, this is indicated - by Error bit in the BMDMA Status register. This type of - errors must be logged as it indicates something is very wrong - with the system. Resetting host controller is recommended. - - - - - - Late completion - - - This occurs when timeout occurs and the timeout handler finds - out that the timed out command has completed successfully or - with error. This is usually caused by lost interrupts. This - type of errors must be logged. Resetting host controller is - recommended. - - - - - - Unknown error (timeout) - - - This is when timeout occurs and the command is still - processing or the host and device are in unknown state. When - this occurs, HSM could be in any valid or invalid state. To - bring the device to known state and make it forget about the - timed out command, resetting is necessary. The timed out - command may be retried. - - - - Timeouts can also be caused by transmission errors. Refer to - for more details. - - - - - - Hotplug and power management exceptions - - - <<TODO: fill here>> - - - - -
- - - EH recovery actions - - - This section discusses several important recovery actions. - - - - Clearing error condition - - - Many controllers require its error registers to be cleared by - error handler. Different controllers may have different - requirements. - - - - For SATA, it's strongly recommended to clear at least SError - register during error handling. - - - - - Reset - - - During EH, resetting is necessary in the following cases. - - - - - - - HSM is in unknown or invalid state - - - - - - HBA is in unknown or invalid state - - - - - - EH needs to make HBA/device forget about in-flight commands - - - - - - HBA/device behaves weirdly - - - - - - - Resetting during EH might be a good idea regardless of error - condition to improve EH robustness. Whether to reset both or - either one of HBA and device depends on situation but the - following scheme is recommended. - - - - - - - When it's known that HBA is in ready state but ATA/ATAPI - device is in unknown state, reset only device. - - - - - - If HBA is in unknown state, reset both HBA and device. - - - - - - - HBA resetting is implementation specific. For a controller - complying to taskfile/BMDMA PCI IDE, stopping active DMA - transaction may be sufficient iff BMDMA state is the only HBA - context. But even mostly taskfile/BMDMA PCI IDE complying - controllers may have implementation specific requirements and - mechanism to reset themselves. This must be addressed by - specific drivers. - - - - OTOH, ATA/ATAPI standard describes in detail ways to reset - ATA/ATAPI devices. - - - - - PATA hardware reset - - - This is hardware initiated device reset signalled with - asserted PATA RESET- signal. There is no standard way to - initiate hardware reset from software although some - hardware provides registers that allow driver to directly - tweak the RESET- signal. - - - - - Software reset - - - This is achieved by turning CONTROL SRST bit on for at - least 5us. Both PATA and SATA support it but, in case of - SATA, this may require controller-specific support as the - second Register FIS to clear SRST should be transmitted - while BSY bit is still set. Note that on PATA, this resets - both master and slave devices on a channel. - - - - - EXECUTE DEVICE DIAGNOSTIC command - - - Although ATA/ATAPI standard doesn't describe exactly, EDD - implies some level of resetting, possibly similar level - with software reset. Host-side EDD protocol can be handled - with normal command processing and most SATA controllers - should be able to handle EDD's just like other commands. - As in software reset, EDD affects both devices on a PATA - bus. - - - Although EDD does reset devices, this doesn't suit error - handling as EDD cannot be issued while BSY is set and it's - unclear how it will act when device is in unknown/weird - state. - - - - - ATAPI DEVICE RESET command - - - This is very similar to software reset except that reset - can be restricted to the selected device without affecting - the other device sharing the cable. - - - - - SATA phy reset - - - This is the preferred way of resetting a SATA device. In - effect, it's identical to PATA hardware reset. Note that - this can be done with the standard SCR Control register. - As such, it's usually easier to implement than software - reset. - - - - - - - - One more thing to consider when resetting devices is that - resetting clears certain configuration parameters and they - need to be set to their previous or newly adjusted values - after reset. - - - - Parameters affected are. - - - - - - - CHS set up with INITIALIZE DEVICE PARAMETERS (seldom used) - - - - - - Parameters set with SET FEATURES including transfer mode setting - - - - - - Block count set with SET MULTIPLE MODE - - - - - - Other parameters (SET MAX, MEDIA LOCK...) - - - - - - - ATA/ATAPI standard specifies that some parameters must be - maintained across hardware or software reset, but doesn't - strictly specify all of them. Always reconfiguring needed - parameters after reset is required for robustness. Note that - this also applies when resuming from deep sleep (power-off). - - - - Also, ATA/ATAPI standard requires that IDENTIFY DEVICE / - IDENTIFY PACKET DEVICE is issued after any configuration - parameter is updated or a hardware reset and the result used - for further operation. OS driver is required to implement - revalidation mechanism to support this. - - - - - - Reconfigure transport - - - For both PATA and SATA, a lot of corners are cut for cheap - connectors, cables or controllers and it's quite common to see - high transmission error rate. This can be mitigated by - lowering transmission speed. - - - - The following is a possible scheme Jeff Garzik suggested. - - -
- - If more than $N (3?) transmission errors happen in 15 minutes, - - - - - if SATA, decrease SATA PHY speed. if speed cannot be decreased, - - - - - decrease UDMA xfer speed. if at UDMA0, switch to PIO4, - - - - - decrease PIO xfer speed. if at PIO3, complain, but continue - - - -
- -
- -
- -
- - - ata_piix Internals -!Idrivers/ata/ata_piix.c - - - - sata_sil Internals -!Idrivers/ata/sata_sil.c - - - - Thanks - - The bulk of the ATA knowledge comes thanks to long conversations with - Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA - and SCSI specifications. - - - Thanks to Alan Cox for pointing out similarities - between SATA and SCSI, and in general for motivation to hack on - libata. - - - libata's device detection - method, ata_pio_devchk, and in general all the early probing was - based on extensive study of Hale Landis's probe/reset code in his - ATADRVR driver (www.ata-atapi.com). - - - -
diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl deleted file mode 100644 index 94f21361e0ed..000000000000 --- a/Documentation/DocBook/librs.tmpl +++ /dev/null @@ -1,289 +0,0 @@ - - - - - - Reed-Solomon Library Programming Interface - - - - Thomas - Gleixner - -
- tglx@linutronix.de -
-
-
-
- - - 2004 - Thomas Gleixner - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - The generic Reed-Solomon Library provides encoding, decoding - and error correction functions. - - - Reed-Solomon codes are used in communication and storage - applications to ensure data integrity. - - - This documentation is provided for developers who want to utilize - the functions provided by the library. - - - - - Known Bugs And Assumptions - - None. - - - - - Usage - - This chapter provides examples of how to use the library. - - - Initializing - - The init function init_rs returns a pointer to an - rs decoder structure, which holds the necessary - information for encoding, decoding and error correction - with the given polynomial. It either uses an existing - matching decoder or creates a new one. On creation all - the lookup tables for fast en/decoding are created. - The function may take a while, so make sure not to - call it in critical code paths. - - -/* the Reed Solomon control structure */ -static struct rs_control *rs_decoder; - -/* Symbolsize is 10 (bits) - * Primitive polynomial is x^10+x^3+1 - * first consecutive root is 0 - * primitive element to generate roots = 1 - * generator polynomial degree (number of roots) = 6 - */ -rs_decoder = init_rs (10, 0x409, 0, 1, 6); - - - - Encoding - - The encoder calculates the Reed-Solomon code over - the given data length and stores the result in - the parity buffer. Note that the parity buffer must - be initialized before calling the encoder. - - - The expanded data can be inverted on the fly by - providing a non-zero inversion mask. The expanded data is - XOR'ed with the mask. This is used e.g. for FLASH - ECC, where the all 0xFF is inverted to an all 0x00. - The Reed-Solomon code for all 0x00 is all 0x00. The - code is inverted before storing to FLASH so it is 0xFF - too. This prevents that reading from an erased FLASH - results in ECC errors. - - - The databytes are expanded to the given symbol size - on the fly. There is no support for encoding continuous - bitstreams with a symbol size != 8 at the moment. If - it is necessary it should be not a big deal to implement - such functionality. - - -/* Parity buffer. Size = number of roots */ -uint16_t par[6]; -/* Initialize the parity buffer */ -memset(par, 0, sizeof(par)); -/* Encode 512 byte in data8. Store parity in buffer par */ -encode_rs8 (rs_decoder, data8, 512, par, 0); - - - - Decoding - - The decoder calculates the syndrome over - the given data length and the received parity symbols - and corrects errors in the data. - - - If a syndrome is available from a hardware decoder - then the syndrome calculation is skipped. - - - The correction of the data buffer can be suppressed - by providing a correction pattern buffer and an error - location buffer to the decoder. The decoder stores the - calculated error location and the correction bitmask - in the given buffers. This is useful for hardware - decoders which use a weird bit ordering scheme. - - - The databytes are expanded to the given symbol size - on the fly. There is no support for decoding continuous - bitstreams with a symbolsize != 8 at the moment. If - it is necessary it should be not a big deal to implement - such functionality. - - - - - Decoding with syndrome calculation, direct data correction - - -/* Parity buffer. Size = number of roots */ -uint16_t par[6]; -uint8_t data[512]; -int numerr; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL); - - - - - - Decoding with syndrome given by hardware decoder, direct data correction - - -/* Parity buffer. Size = number of roots */ -uint16_t par[6], syn[6]; -uint8_t data[512]; -int numerr; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Get syndrome from hardware decoder */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL); - - - - - - Decoding with syndrome given by hardware decoder, no direct data correction. - - - Note: It's not necessary to give data and received parity to the decoder. - - -/* Parity buffer. Size = number of roots */ -uint16_t par[6], syn[6], corr[8]; -uint8_t data[512]; -int numerr, errpos[8]; -/* Receive data */ -..... -/* Receive parity */ -..... -/* Get syndrome from hardware decoder */ -..... -/* Decode 512 byte in data8.*/ -numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr); -for (i = 0; i < numerr; i++) { - do_error_correction_in_your_buffer(errpos[i], corr[i]); -} - - - - - Cleanup - - The function free_rs frees the allocated resources, - if the caller is the last user of the decoder. - - -/* Release resources */ -free_rs(rs_decoder); - - - - - - - Structures - - This chapter contains the autogenerated documentation of the structures which are - used in the Reed-Solomon Library and are relevant for a developer. - -!Iinclude/linux/rslib.h - - - - Public Functions Provided - - This chapter contains the autogenerated documentation of the Reed-Solomon functions - which are exported. - -!Elib/reed_solomon/reed_solomon.c - - - - Credits - - The library code for encoding and decoding was written by Phil Karn. - - - Copyright 2002, Phil Karn, KA9Q - May be used under the terms of the GNU General Public License (GPL) - - - The wrapper functions and interfaces are written by Thomas Gleixner. - - - Many users have provided bugfixes, improvements and helping hands for testing. - Thanks a lot. - - - The following people have contributed to this document: - - - Thomas Gleixnertglx@linutronix.de - - -
diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl deleted file mode 100644 index fe7664ce9667..000000000000 --- a/Documentation/DocBook/lsm.tmpl +++ /dev/null @@ -1,265 +0,0 @@ - - - -
- - Linux Security Modules: General Security Hooks for Linux - - - Stephen - Smalley - - NAI Labs -
ssmalley@nai.com
-
-
- - Timothy - Fraser - - NAI Labs -
tfraser@nai.com
-
-
- - Chris - Vance - - NAI Labs -
cvance@nai.com
-
-
-
-
- -Introduction - - -In March 2001, the National Security Agency (NSA) gave a presentation -about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel -Summit. SELinux is an implementation of flexible and fine-grained -nondiscretionary access controls in the Linux kernel, originally -implemented as its own particular kernel patch. Several other -security projects (e.g. RSBAC, Medusa) have also developed flexible -access control architectures for the Linux kernel, and various -projects have developed particular access control models for Linux -(e.g. LIDS, DTE, SubDomain). Each project has developed and -maintained its own kernel patch to support its security needs. - - - -In response to the NSA presentation, Linus Torvalds made a set of -remarks that described a security framework he would be willing to -consider for inclusion in the mainstream Linux kernel. He described a -general framework that would provide a set of security hooks to -control operations on kernel objects and a set of opaque security -fields in kernel data structures for maintaining security attributes. -This framework could then be used by loadable kernel modules to -implement any desired model of security. Linus also suggested the -possibility of migrating the Linux capabilities code into such a -module. - - - -The Linux Security Modules (LSM) project was started by WireX to -develop such a framework. LSM is a joint development effort by -several security projects, including Immunix, SELinux, SGI and Janus, -and several individuals, including Greg Kroah-Hartman and James -Morris, to develop a Linux kernel patch that implements this -framework. The patch is currently tracking the 2.4 series and is -targeted for integration into the 2.5 development series. This -technical report provides an overview of the framework and the example -capabilities security module provided by the LSM kernel patch. - - - - -LSM Framework - - -The LSM kernel patch provides a general kernel framework to support -security modules. In particular, the LSM framework is primarily -focused on supporting access control modules, although future -development is likely to address other security needs such as -auditing. By itself, the framework does not provide any additional -security; it merely provides the infrastructure to support security -modules. The LSM kernel patch also moves most of the capabilities -logic into an optional security module, with the system defaulting -to the traditional superuser logic. This capabilities module -is discussed further in . - - - -The LSM kernel patch adds security fields to kernel data structures -and inserts calls to hook functions at critical points in the kernel -code to manage the security fields and to perform access control. It -also adds functions for registering and unregistering security -modules, and adds a general security system call -to support new system calls for security-aware applications. - - - -The LSM security fields are simply void* pointers. For -process and program execution security information, security fields -were added to struct task_struct and -struct linux_binprm. For filesystem security -information, a security field was added to -struct super_block. For pipe, file, and socket -security information, security fields were added to -struct inode and -struct file. For packet and network device security -information, security fields were added to -struct sk_buff and -struct net_device. For System V IPC security -information, security fields were added to -struct kern_ipc_perm and -struct msg_msg; additionally, the definitions -for struct msg_msg, struct -msg_queue, and struct -shmid_kernel were moved to header files -(include/linux/msg.h and -include/linux/shm.h as appropriate) to allow -the security modules to use these definitions. - - - -Each LSM hook is a function pointer in a global table, -security_ops. This table is a -security_operations structure as defined by -include/linux/security.h. Detailed documentation -for each hook is included in this header file. At present, this -structure consists of a collection of substructures that group related -hooks based on the kernel object (e.g. task, inode, file, sk_buff, -etc) as well as some top-level hook function pointers for system -operations. This structure is likely to be flattened in the future -for performance. The placement of the hook calls in the kernel code -is described by the "called:" lines in the per-hook documentation in -the header file. The hook calls can also be easily found in the -kernel code by looking for the string "security_ops->". - - - - -Linus mentioned per-process security hooks in his original remarks as a -possible alternative to global security hooks. However, if LSM were -to start from the perspective of per-process hooks, then the base -framework would have to deal with how to handle operations that -involve multiple processes (e.g. kill), since each process might have -its own hook for controlling the operation. This would require a -general mechanism for composing hooks in the base framework. -Additionally, LSM would still need global hooks for operations that -have no process context (e.g. network input operations). -Consequently, LSM provides global security hooks, but a security -module is free to implement per-process hooks (where that makes sense) -by storing a security_ops table in each process' security field and -then invoking these per-process hooks from the global hooks. -The problem of composition is thus deferred to the module. - - - -The global security_ops table is initialized to a set of hook -functions provided by a dummy security module that provides -traditional superuser logic. A register_security -function (in security/security.c) is provided to -allow a security module to set security_ops to refer to its own hook -functions, and an unregister_security function is -provided to revert security_ops to the dummy module hooks. This -mechanism is used to set the primary security module, which is -responsible for making the final decision for each hook. - - - -LSM also provides a simple mechanism for stacking additional security -modules with the primary security module. It defines -register_security and -unregister_security hooks in the -security_operations structure and provides -mod_reg_security and -mod_unreg_security functions that invoke these -hooks after performing some sanity checking. A security module can -call these functions in order to stack with other modules. However, -the actual details of how this stacking is handled are deferred to the -module, which can implement these hooks in any way it wishes -(including always returning an error if it does not wish to support -stacking). In this manner, LSM again defers the problem of -composition to the module. - - - -Although the LSM hooks are organized into substructures based on -kernel object, all of the hooks can be viewed as falling into two -major categories: hooks that are used to manage the security fields -and hooks that are used to perform access control. Examples of the -first category of hooks include the -alloc_security and -free_security hooks defined for each kernel data -structure that has a security field. These hooks are used to allocate -and free security structures for kernel objects. The first category -of hooks also includes hooks that set information in the security -field after allocation, such as the post_lookup -hook in struct inode_security_ops. This hook -is used to set security information for inodes after successful lookup -operations. An example of the second category of hooks is the -permission hook in -struct inode_security_ops. This hook checks -permission when accessing an inode. - - - - -LSM Capabilities Module - - -The LSM kernel patch moves most of the existing POSIX.1e capabilities -logic into an optional security module stored in the file -security/capability.c. This change allows -users who do not want to use capabilities to omit this code entirely -from their kernel, instead using the dummy module for traditional -superuser logic or any other module that they desire. This change -also allows the developers of the capabilities logic to maintain and -enhance their code more freely, without needing to integrate patches -back into the base kernel. - - - -In addition to moving the capabilities logic, the LSM kernel patch -could move the capability-related fields from the kernel data -structures into the new security fields managed by the security -modules. However, at present, the LSM kernel patch leaves the -capability fields in the kernel data structures. In his original -remarks, Linus suggested that this might be preferable so that other -security modules can be easily stacked with the capabilities module -without needing to chain multiple security structures on the security field. -It also avoids imposing extra overhead on the capabilities module -to manage the security fields. However, the LSM framework could -certainly support such a move if it is determined to be desirable, -with only a few additional changes described below. - - - -At present, the capabilities logic for computing process capabilities -on execve and set*uid, -checking capabilities for a particular process, saving and checking -capabilities for netlink messages, and handling the -capget and capset system -calls have been moved into the capabilities module. There are still a -few locations in the base kernel where capability-related fields are -directly examined or modified, but the current version of the LSM -patch does allow a security module to completely replace the -assignment and testing of capabilities. These few locations would -need to be changed if the capability-related fields were moved into -the security field. The following is a list of known locations that -still perform such direct examination or modification of -capability-related fields: - -fs/open.c:sys_access -fs/lockd/host.c:nlm_bind_host -fs/nfsd/auth.c:nfsd_setuser -fs/proc/array.c:task_cap - - - - - -
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl deleted file mode 100644 index b442921bca54..000000000000 --- a/Documentation/DocBook/mtdnand.tmpl +++ /dev/null @@ -1,1291 +0,0 @@ - - - - - - MTD NAND Driver Programming Interface - - - - Thomas - Gleixner - -
- tglx@linutronix.de -
-
-
-
- - - 2004 - Thomas Gleixner - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - The generic NAND driver supports almost all NAND and AG-AND based - chips and connects them to the Memory Technology Devices (MTD) - subsystem of the Linux Kernel. - - - This documentation is provided for developers who want to implement - board drivers or filesystem drivers suitable for NAND devices. - - - - - Known Bugs And Assumptions - - None. - - - - - Documentation hints - - The function and structure docs are autogenerated. Each function and - struct member has a short description which is marked with an [XXX] identifier. - The following chapters explain the meaning of those identifiers. - - - Function identifiers [XXX] - - The functions are marked with [XXX] identifiers in the short - comment. The identifiers explain the usage and scope of the - functions. Following identifiers are used: - - - - [MTD Interface] - These functions provide the interface to the MTD kernel API. - They are not replaceable and provide functionality - which is complete hardware independent. - - - [NAND Interface] - These functions are exported and provide the interface to the NAND kernel API. - - - [GENERIC] - Generic functions are not replaceable and provide functionality - which is complete hardware independent. - - - [DEFAULT] - Default functions provide hardware related functionality which is suitable - for most of the implementations. These functions can be replaced by the - board driver if necessary. Those functions are called via pointers in the - NAND chip description structure. The board driver can set the functions which - should be replaced by board dependent functions before calling nand_scan(). - If the function pointer is NULL on entry to nand_scan() then the pointer - is set to the default function which is suitable for the detected chip type. - - - - - Struct member identifiers [XXX] - - The struct members are marked with [XXX] identifiers in the - comment. The identifiers explain the usage and scope of the - members. Following identifiers are used: - - - - [INTERN] - These members are for NAND driver internal use only and must not be - modified. Most of these values are calculated from the chip geometry - information which is evaluated during nand_scan(). - - - [REPLACEABLE] - Replaceable members hold hardware related functions which can be - provided by the board driver. The board driver can set the functions which - should be replaced by board dependent functions before calling nand_scan(). - If the function pointer is NULL on entry to nand_scan() then the pointer - is set to the default function which is suitable for the detected chip type. - - - [BOARDSPECIFIC] - Board specific members hold hardware related information which must - be provided by the board driver. The board driver must set the function - pointers and datafields before calling nand_scan(). - - - [OPTIONAL] - Optional members can hold information relevant for the board driver. The - generic NAND driver code does not use this information. - - - - - - - Basic board driver - - For most boards it will be sufficient to provide just the - basic functions and fill out some really board dependent - members in the nand chip description structure. - - - Basic defines - - At least you have to provide a nand_chip structure - and a storage for the ioremap'ed chip address. - You can allocate the nand_chip structure using - kmalloc or you can allocate it statically. - The NAND chip structure embeds an mtd structure - which will be registered to the MTD subsystem. - You can extract a pointer to the mtd structure - from a nand_chip pointer using the nand_to_mtd() - helper. - - - Kmalloc based example - - -static struct mtd_info *board_mtd; -static void __iomem *baseaddr; - - - Static example - - -static struct nand_chip board_chip; -static void __iomem *baseaddr; - - - - Partition defines - - If you want to divide your device into partitions, then - define a partitioning scheme suitable to your board. - - -#define NUM_PARTITIONS 2 -static struct mtd_partition partition_info[] = { - { .name = "Flash partition 1", - .offset = 0, - .size = 8 * 1024 * 1024 }, - { .name = "Flash partition 2", - .offset = MTDPART_OFS_NEXT, - .size = MTDPART_SIZ_FULL }, -}; - - - - Hardware control function - - The hardware control function provides access to the - control pins of the NAND chip(s). - The access can be done by GPIO pins or by address lines. - If you use address lines, make sure that the timing - requirements are met. - - - GPIO based example - - -static void board_hwcontrol(struct mtd_info *mtd, int cmd) -{ - switch(cmd){ - case NAND_CTL_SETCLE: /* Set CLE pin high */ break; - case NAND_CTL_CLRCLE: /* Set CLE pin low */ break; - case NAND_CTL_SETALE: /* Set ALE pin high */ break; - case NAND_CTL_CLRALE: /* Set ALE pin low */ break; - case NAND_CTL_SETNCE: /* Set nCE pin low */ break; - case NAND_CTL_CLRNCE: /* Set nCE pin high */ break; - } -} - - - Address lines based example. It's assumed that the - nCE pin is driven by a chip select decoder. - - -static void board_hwcontrol(struct mtd_info *mtd, int cmd) -{ - struct nand_chip *this = mtd_to_nand(mtd); - switch(cmd){ - case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; - case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; - case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break; - case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break; - } -} - - - - Device ready function - - If the hardware interface has the ready busy pin of the NAND chip connected to a - GPIO or other accessible I/O pin, this function is used to read back the state of the - pin. The function has no arguments and should return 0, if the device is busy (R/B pin - is low) and 1, if the device is ready (R/B pin is high). - If the hardware interface does not give access to the ready busy pin, then - the function must not be defined and the function pointer this->dev_ready is set to NULL. - - - - Init function - - The init function allocates memory and sets up all the board - specific parameters and function pointers. When everything - is set up nand_scan() is called. This function tries to - detect and identify then chip. If a chip is found all the - internal data fields are initialized accordingly. - The structure(s) have to be zeroed out first and then filled with the necessary - information about the device. - - -static int __init board_init (void) -{ - struct nand_chip *this; - int err = 0; - - /* Allocate memory for MTD device structure and private data */ - this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL); - if (!this) { - printk ("Unable to allocate NAND MTD device structure.\n"); - err = -ENOMEM; - goto out; - } - - board_mtd = nand_to_mtd(this); - - /* map physical address */ - baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024); - if (!baseaddr) { - printk("Ioremap to access NAND chip failed\n"); - err = -EIO; - goto out_mtd; - } - - /* Set address of NAND IO lines */ - this->IO_ADDR_R = baseaddr; - this->IO_ADDR_W = baseaddr; - /* Reference hardware control function */ - this->hwcontrol = board_hwcontrol; - /* Set command delay time, see datasheet for correct value */ - this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; - /* Assign the device ready function, if available */ - this->dev_ready = board_dev_ready; - this->eccmode = NAND_ECC_SOFT; - - /* Scan to find existence of the device */ - if (nand_scan (board_mtd, 1)) { - err = -ENXIO; - goto out_ior; - } - - add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS); - goto out; - -out_ior: - iounmap(baseaddr); -out_mtd: - kfree (this); -out: - return err; -} -module_init(board_init); - - - - Exit function - - The exit function is only necessary if the driver is - compiled as a module. It releases all resources which - are held by the chip driver and unregisters the partitions - in the MTD layer. - - -#ifdef MODULE -static void __exit board_cleanup (void) -{ - /* Release resources, unregister device */ - nand_release (board_mtd); - - /* unmap physical address */ - iounmap(baseaddr); - - /* Free the MTD device structure */ - kfree (mtd_to_nand(board_mtd)); -} -module_exit(board_cleanup); -#endif - - - - - - Advanced board driver functions - - This chapter describes the advanced functionality of the NAND - driver. For a list of functions which can be overridden by the board - driver see the documentation of the nand_chip structure. - - - Multiple chip control - - The nand driver can control chip arrays. Therefore the - board driver must provide an own select_chip function. This - function must (de)select the requested chip. - The function pointer in the nand_chip structure must - be set before calling nand_scan(). The maxchip parameter - of nand_scan() defines the maximum number of chips to - scan for. Make sure that the select_chip function can - handle the requested number of chips. - - - The nand driver concatenates the chips to one virtual - chip and provides this virtual chip to the MTD layer. - - - Note: The driver can only handle linear chip arrays - of equally sized chips. There is no support for - parallel arrays which extend the buswidth. - - - GPIO based example - - -static void board_select_chip (struct mtd_info *mtd, int chip) -{ - /* Deselect all chips, set all nCE pins high */ - GPIO(BOARD_NAND_NCE) |= 0xff; - if (chip >= 0) - GPIO(BOARD_NAND_NCE) &= ~ (1 << chip); -} - - - Address lines based example. - Its assumed that the nCE pins are connected to an - address decoder. - - -static void board_select_chip (struct mtd_info *mtd, int chip) -{ - struct nand_chip *this = mtd_to_nand(mtd); - - /* Deselect all chips */ - this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; - this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; - switch (chip) { - case 0: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; - break; - .... - case n: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; - break; - } -} - - - - Hardware ECC support - - Functions and constants - - The nand driver supports three different types of - hardware ECC. - - NAND_ECC_HW3_256 - Hardware ECC generator providing 3 bytes ECC per - 256 byte. - - NAND_ECC_HW3_512 - Hardware ECC generator providing 3 bytes ECC per - 512 byte. - - NAND_ECC_HW6_512 - Hardware ECC generator providing 6 bytes ECC per - 512 byte. - - NAND_ECC_HW8_512 - Hardware ECC generator providing 6 bytes ECC per - 512 byte. - - - If your hardware generator has a different functionality - add it at the appropriate place in nand_base.c - - - The board driver must provide following functions: - - enable_hwecc - This function is called before reading / writing to - the chip. Reset or initialize the hardware generator - in this function. The function is called with an - argument which let you distinguish between read - and write operations. - - calculate_ecc - This function is called after read / write from / to - the chip. Transfer the ECC from the hardware to - the buffer. If the option NAND_HWECC_SYNDROME is set - then the function is only called on write. See below. - - correct_data - In case of an ECC error this function is called for - error detection and correction. Return 1 respectively 2 - in case the error can be corrected. If the error is - not correctable return -1. If your hardware generator - matches the default algorithm of the nand_ecc software - generator then use the correction function provided - by nand_ecc instead of implementing duplicated code. - - - - - - Hardware ECC with syndrome calculation - - Many hardware ECC implementations provide Reed-Solomon - codes and calculate an error syndrome on read. The syndrome - must be converted to a standard Reed-Solomon syndrome - before calling the error correction code in the generic - Reed-Solomon library. - - - The ECC bytes must be placed immediately after the data - bytes in order to make the syndrome generator work. This - is contrary to the usual layout used by software ECC. The - separation of data and out of band area is not longer - possible. The nand driver code handles this layout and - the remaining free bytes in the oob area are managed by - the autoplacement code. Provide a matching oob-layout - in this case. See rts_from4.c and diskonchip.c for - implementation reference. In those cases we must also - use bad block tables on FLASH, because the ECC layout is - interfering with the bad block marker positions. - See bad block table support for details. - - - - - Bad block table support - - Most NAND chips mark the bad blocks at a defined - position in the spare area. Those blocks must - not be erased under any circumstances as the bad - block information would be lost. - It is possible to check the bad block mark each - time when the blocks are accessed by reading the - spare area of the first page in the block. This - is time consuming so a bad block table is used. - - - The nand driver supports various types of bad block - tables. - - Per device - The bad block table contains all bad block information - of the device which can consist of multiple chips. - - Per chip - A bad block table is used per chip and contains the - bad block information for this particular chip. - - Fixed offset - The bad block table is located at a fixed offset - in the chip (device). This applies to various - DiskOnChip devices. - - Automatic placed - The bad block table is automatically placed and - detected either at the end or at the beginning - of a chip (device) - - Mirrored tables - The bad block table is mirrored on the chip (device) to - allow updates of the bad block table without data loss. - - - - - nand_scan() calls the function nand_default_bbt(). - nand_default_bbt() selects appropriate default - bad block table descriptors depending on the chip information - which was retrieved by nand_scan(). - - - The standard policy is scanning the device for bad - blocks and build a ram based bad block table which - allows faster access than always checking the - bad block information on the flash chip itself. - - - Flash based tables - - It may be desired or necessary to keep a bad block table in FLASH. - For AG-AND chips this is mandatory, as they have no factory marked - bad blocks. They have factory marked good blocks. The marker pattern - is erased when the block is erased to be reused. So in case of - powerloss before writing the pattern back to the chip this block - would be lost and added to the bad blocks. Therefore we scan the - chip(s) when we detect them the first time for good blocks and - store this information in a bad block table before erasing any - of the blocks. - - - The blocks in which the tables are stored are protected against - accidental access by marking them bad in the memory bad block - table. The bad block table management functions are allowed - to circumvent this protection. - - - The simplest way to activate the FLASH based bad block table support - is to set the option NAND_BBT_USE_FLASH in the bbt_option field of - the nand chip structure before calling nand_scan(). For AG-AND - chips is this done by default. - This activates the default FLASH based bad block table functionality - of the NAND driver. The default bad block table options are - - Store bad block table per chip - Use 2 bits per block - Automatic placement at the end of the chip - Use mirrored tables with version numbers - Reserve 4 blocks at the end of the chip - - - - - User defined tables - - User defined tables are created by filling out a - nand_bbt_descr structure and storing the pointer in the - nand_chip structure member bbt_td before calling nand_scan(). - If a mirror table is necessary a second structure must be - created and a pointer to this structure must be stored - in bbt_md inside the nand_chip structure. If the bbt_md - member is set to NULL then only the main table is used - and no scan for the mirrored table is performed. - - - The most important field in the nand_bbt_descr structure - is the options field. The options define most of the - table properties. Use the predefined constants from - nand.h to define the options. - - Number of bits per block - The supported number of bits is 1, 2, 4, 8. - Table per chip - Setting the constant NAND_BBT_PERCHIP selects that - a bad block table is managed for each chip in a chip array. - If this option is not set then a per device bad block table - is used. - Table location is absolute - Use the option constant NAND_BBT_ABSPAGE and - define the absolute page number where the bad block - table starts in the field pages. If you have selected bad block - tables per chip and you have a multi chip array then the start page - must be given for each chip in the chip array. Note: there is no scan - for a table ident pattern performed, so the fields - pattern, veroffs, offs, len can be left uninitialized - Table location is automatically detected - The table can either be located in the first or the last good - blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place - the bad block table at the end of the chip (device). The - bad block tables are marked and identified by a pattern which - is stored in the spare area of the first page in the block which - holds the bad block table. Store a pointer to the pattern - in the pattern field. Further the length of the pattern has to be - stored in len and the offset in the spare area must be given - in the offs member of the nand_bbt_descr structure. For mirrored - bad block tables different patterns are mandatory. - Table creation - Set the option NAND_BBT_CREATE to enable the table creation - if no table can be found during the scan. Usually this is done only - once if a new chip is found. - Table write support - Set the option NAND_BBT_WRITE to enable the table write support. - This allows the update of the bad block table(s) in case a block has - to be marked bad due to wear. The MTD interface function block_markbad - is calling the update function of the bad block table. If the write - support is enabled then the table is updated on FLASH. - - Note: Write support should only be enabled for mirrored tables with - version control. - - Table version control - Set the option NAND_BBT_VERSION to enable the table version control. - It's highly recommended to enable this for mirrored tables with write - support. It makes sure that the risk of losing the bad block - table information is reduced to the loss of the information about the - one worn out block which should be marked bad. The version is stored in - 4 consecutive bytes in the spare area of the device. The position of - the version number is defined by the member veroffs in the bad block table - descriptor. - Save block contents on write - - In case that the block which holds the bad block table does contain - other useful information, set the option NAND_BBT_SAVECONTENT. When - the bad block table is written then the whole block is read the bad - block table is updated and the block is erased and everything is - written back. If this option is not set only the bad block table - is written and everything else in the block is ignored and erased. - - Number of reserved blocks - - For automatic placement some blocks must be reserved for - bad block table storage. The number of reserved blocks is defined - in the maxblocks member of the bad block table description structure. - Reserving 4 blocks for mirrored tables should be a reasonable number. - This also limits the number of blocks which are scanned for the bad - block table ident pattern. - - - - - - - Spare area (auto)placement - - The nand driver implements different possibilities for - placement of filesystem data in the spare area, - - Placement defined by fs driver - Automatic placement - - The default placement function is automatic placement. The - nand driver has built in default placement schemes for the - various chiptypes. If due to hardware ECC functionality the - default placement does not fit then the board driver can - provide a own placement scheme. - - - File system drivers can provide a own placement scheme which - is used instead of the default placement scheme. - - - Placement schemes are defined by a nand_oobinfo structure - -struct nand_oobinfo { - int useecc; - int eccbytes; - int eccpos[24]; - int oobfree[8][2]; -}; - - - useecc - The useecc member controls the ecc and placement function. The header - file include/mtd/mtd-abi.h contains constants to select ecc and - placement. MTD_NANDECC_OFF switches off the ecc complete. This is - not recommended and available for testing and diagnosis only. - MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE - selects automatic placement. - - eccbytes - The eccbytes member defines the number of ecc bytes per page. - - eccpos - The eccpos array holds the byte offsets in the spare area where - the ecc codes are placed. - - oobfree - The oobfree array defines the areas in the spare area which can be - used for automatic placement. The information is given in the format - {offset, size}. offset defines the start of the usable area, size the - length in bytes. More than one area can be defined. The list is terminated - by an {0, 0} entry. - - - - - Placement defined by fs driver - - The calling function provides a pointer to a nand_oobinfo - structure which defines the ecc placement. For writes the - caller must provide a spare area buffer along with the - data buffer. The spare area buffer size is (number of pages) * - (size of spare area). For reads the buffer size is - (number of pages) * ((size of spare area) + (number of ecc - steps per page) * sizeof (int)). The driver stores the - result of the ecc check for each tuple in the spare buffer. - The storage sequence is - - - <spare data page 0><ecc result 0>...<ecc result n> - - - ... - - - <spare data page n><ecc result 0>...<ecc result n> - - - This is a legacy mode used by YAFFS1. - - - If the spare area buffer is NULL then only the ECC placement is - done according to the given scheme in the nand_oobinfo structure. - - - - Automatic placement - - Automatic placement uses the built in defaults to place the - ecc bytes in the spare area. If filesystem data have to be stored / - read into the spare area then the calling function must provide a - buffer. The buffer size per page is determined by the oobfree array in - the nand_oobinfo structure. - - - If the spare area buffer is NULL then only the ECC placement is - done according to the default builtin scheme. - - - - - Spare area autoplacement default schemes - - 256 byte pagesize - - -Offset -Content -Comment - - -0x00 -ECC byte 0 -Error correction code byte 0 - - -0x01 -ECC byte 1 -Error correction code byte 1 - - -0x02 -ECC byte 2 -Error correction code byte 2 - - -0x03 -Autoplace 0 - - - -0x04 -Autoplace 1 - - - -0x05 -Bad block marker -If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved - - -0x06 -Autoplace 2 - - - -0x07 -Autoplace 3 - - - - - - 512 byte pagesize - - -Offset -Content -Comment - - -0x00 -ECC byte 0 -Error correction code byte 0 of the lower 256 Byte data in -this page - - -0x01 -ECC byte 1 -Error correction code byte 1 of the lower 256 Bytes of data -in this page - - -0x02 -ECC byte 2 -Error correction code byte 2 of the lower 256 Bytes of data -in this page - - -0x03 -ECC byte 3 -Error correction code byte 0 of the upper 256 Bytes of data -in this page - - -0x04 -reserved -reserved - - -0x05 -Bad block marker -If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved - - -0x06 -ECC byte 4 -Error correction code byte 1 of the upper 256 Bytes of data -in this page - - -0x07 -ECC byte 5 -Error correction code byte 2 of the upper 256 Bytes of data -in this page - - -0x08 - 0x0F -Autoplace 0 - 7 - - - - - - 2048 byte pagesize - - -Offset -Content -Comment - - -0x00 -Bad block marker -If any bit in this byte is zero, then this block is bad. -This applies only to the first page in a block. In the remaining -pages this byte is reserved - - -0x01 -Reserved -Reserved - - -0x02-0x27 -Autoplace 0 - 37 - - - -0x28 -ECC byte 0 -Error correction code byte 0 of the first 256 Byte data in -this page - - -0x29 -ECC byte 1 -Error correction code byte 1 of the first 256 Bytes of data -in this page - - -0x2A -ECC byte 2 -Error correction code byte 2 of the first 256 Bytes data in -this page - - -0x2B -ECC byte 3 -Error correction code byte 0 of the second 256 Bytes of data -in this page - - -0x2C -ECC byte 4 -Error correction code byte 1 of the second 256 Bytes of data -in this page - - -0x2D -ECC byte 5 -Error correction code byte 2 of the second 256 Bytes of data -in this page - - -0x2E -ECC byte 6 -Error correction code byte 0 of the third 256 Bytes of data -in this page - - -0x2F -ECC byte 7 -Error correction code byte 1 of the third 256 Bytes of data -in this page - - -0x30 -ECC byte 8 -Error correction code byte 2 of the third 256 Bytes of data -in this page - - -0x31 -ECC byte 9 -Error correction code byte 0 of the fourth 256 Bytes of data -in this page - - -0x32 -ECC byte 10 -Error correction code byte 1 of the fourth 256 Bytes of data -in this page - - -0x33 -ECC byte 11 -Error correction code byte 2 of the fourth 256 Bytes of data -in this page - - -0x34 -ECC byte 12 -Error correction code byte 0 of the fifth 256 Bytes of data -in this page - - -0x35 -ECC byte 13 -Error correction code byte 1 of the fifth 256 Bytes of data -in this page - - -0x36 -ECC byte 14 -Error correction code byte 2 of the fifth 256 Bytes of data -in this page - - -0x37 -ECC byte 15 -Error correction code byte 0 of the sixt 256 Bytes of data -in this page - - -0x38 -ECC byte 16 -Error correction code byte 1 of the sixt 256 Bytes of data -in this page - - -0x39 -ECC byte 17 -Error correction code byte 2 of the sixt 256 Bytes of data -in this page - - -0x3A -ECC byte 18 -Error correction code byte 0 of the seventh 256 Bytes of -data in this page - - -0x3B -ECC byte 19 -Error correction code byte 1 of the seventh 256 Bytes of -data in this page - - -0x3C -ECC byte 20 -Error correction code byte 2 of the seventh 256 Bytes of -data in this page - - -0x3D -ECC byte 21 -Error correction code byte 0 of the eighth 256 Bytes of data -in this page - - -0x3E -ECC byte 22 -Error correction code byte 1 of the eighth 256 Bytes of data -in this page - - -0x3F -ECC byte 23 -Error correction code byte 2 of the eighth 256 Bytes of data -in this page - - - - - - - - Filesystem support - - The NAND driver provides all necessary functions for a - filesystem via the MTD interface. - - - Filesystems must be aware of the NAND peculiarities and - restrictions. One major restrictions of NAND Flash is, that you cannot - write as often as you want to a page. The consecutive writes to a page, - before erasing it again, are restricted to 1-3 writes, depending on the - manufacturers specifications. This applies similar to the spare area. - - - Therefore NAND aware filesystems must either write in page size chunks - or hold a writebuffer to collect smaller writes until they sum up to - pagesize. Available NAND aware filesystems: JFFS2, YAFFS. - - - The spare area usage to store filesystem data is controlled by - the spare area placement functionality which is described in one - of the earlier chapters. - - - - Tools - - The MTD project provides a couple of helpful tools to handle NAND Flash. - - flasherase, flasheraseall: Erase and format FLASH partitions - nandwrite: write filesystem images to NAND FLASH - nanddump: dump the contents of a NAND FLASH partitions - - - - These tools are aware of the NAND restrictions. Please use those tools - instead of complaining about errors which are caused by non NAND aware - access methods. - - - - - Constants - - This chapter describes the constants which might be relevant for a driver developer. - - - Chip option constants - - Constants for chip id table - - These constants are defined in nand.h. They are ored together to describe - the chip functionality. - -/* Buswitdh is 16 bit */ -#define NAND_BUSWIDTH_16 0x00000002 -/* Device supports partial programming without padding */ -#define NAND_NO_PADDING 0x00000004 -/* Chip has cache program function */ -#define NAND_CACHEPRG 0x00000008 -/* Chip has copy back function */ -#define NAND_COPYBACK 0x00000010 -/* AND Chip which has 4 banks and a confusing page / block - * assignment. See Renesas datasheet for further information */ -#define NAND_IS_AND 0x00000020 -/* Chip has a array of 4 pages which can be read without - * additional ready /busy waits */ -#define NAND_4PAGE_ARRAY 0x00000040 - - - - - Constants for runtime options - - These constants are defined in nand.h. They are ored together to describe - the functionality. - -/* The hw ecc generator provides a syndrome instead a ecc value on read - * This can only work if we have the ecc bytes directly behind the - * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */ -#define NAND_HWECC_SYNDROME 0x00020000 - - - - - - - ECC selection constants - - Use these constants to select the ECC algorithm. - -/* No ECC. Usage is not recommended ! */ -#define NAND_ECC_NONE 0 -/* Software ECC 3 byte ECC per 256 Byte data */ -#define NAND_ECC_SOFT 1 -/* Hardware ECC 3 byte ECC per 256 Byte data */ -#define NAND_ECC_HW3_256 2 -/* Hardware ECC 3 byte ECC per 512 Byte data */ -#define NAND_ECC_HW3_512 3 -/* Hardware ECC 6 byte ECC per 512 Byte data */ -#define NAND_ECC_HW6_512 4 -/* Hardware ECC 6 byte ECC per 512 Byte data */ -#define NAND_ECC_HW8_512 6 - - - - - - Hardware control related constants - - These constants describe the requested hardware access function when - the boardspecific hardware control function is called - -/* Select the chip by setting nCE to low */ -#define NAND_CTL_SETNCE 1 -/* Deselect the chip by setting nCE to high */ -#define NAND_CTL_CLRNCE 2 -/* Select the command latch by setting CLE to high */ -#define NAND_CTL_SETCLE 3 -/* Deselect the command latch by setting CLE to low */ -#define NAND_CTL_CLRCLE 4 -/* Select the address latch by setting ALE to high */ -#define NAND_CTL_SETALE 5 -/* Deselect the address latch by setting ALE to low */ -#define NAND_CTL_CLRALE 6 -/* Set write protection by setting WP to high. Not used! */ -#define NAND_CTL_SETWP 7 -/* Clear write protection by setting WP to low. Not used! */ -#define NAND_CTL_CLRWP 8 - - - - - - Bad block table related constants - - These constants describe the options used for bad block - table descriptors. - -/* Options for the bad block table descriptors */ - -/* The number of bits used per block in the bbt on the device */ -#define NAND_BBT_NRBITS_MSK 0x0000000F -#define NAND_BBT_1BIT 0x00000001 -#define NAND_BBT_2BIT 0x00000002 -#define NAND_BBT_4BIT 0x00000004 -#define NAND_BBT_8BIT 0x00000008 -/* The bad block table is in the last good block of the device */ -#define NAND_BBT_LASTBLOCK 0x00000010 -/* The bbt is at the given page, else we must scan for the bbt */ -#define NAND_BBT_ABSPAGE 0x00000020 -/* bbt is stored per chip on multichip devices */ -#define NAND_BBT_PERCHIP 0x00000080 -/* bbt has a version counter at offset veroffs */ -#define NAND_BBT_VERSION 0x00000100 -/* Create a bbt if none axists */ -#define NAND_BBT_CREATE 0x00000200 -/* Write bbt if necessary */ -#define NAND_BBT_WRITE 0x00001000 -/* Read and write back block contents when writing bbt */ -#define NAND_BBT_SAVECONTENT 0x00002000 - - - - - - - - Structures - - This chapter contains the autogenerated documentation of the structures which are - used in the NAND driver and might be relevant for a driver developer. Each - struct member has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - -!Iinclude/linux/mtd/nand.h - - - - Public Functions Provided - - This chapter contains the autogenerated documentation of the NAND kernel API functions - which are exported. Each function has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - -!Edrivers/mtd/nand/nand_base.c -!Edrivers/mtd/nand/nand_bbt.c -!Edrivers/mtd/nand/nand_ecc.c - - - - Internal Functions Provided - - This chapter contains the autogenerated documentation of the NAND driver internal functions. - Each function has a short description which is marked with an [XXX] identifier. - See the chapter "Documentation hints" for an explanation. - The functions marked with [DEFAULT] might be relevant for a board driver developer. - -!Idrivers/mtd/nand/nand_base.c -!Idrivers/mtd/nand/nand_bbt.c - - - - - Credits - - The following people have contributed to the NAND driver: - - Steven J. Hillsjhill@realitydiluted.com - David Woodhousedwmw2@infradead.org - Thomas Gleixnertglx@linutronix.de - - A lot of users have provided bugfixes, improvements and helping hands for testing. - Thanks a lot. - - - The following people have contributed to this document: - - Thomas Gleixnertglx@linutronix.de - - - -
diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl deleted file mode 100644 index 29df25016c7c..000000000000 --- a/Documentation/DocBook/networking.tmpl +++ /dev/null @@ -1,111 +0,0 @@ - - - - - - Linux Networking and Network Devices APIs - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - - - - - - - Linux Networking - Networking Base Types -!Iinclude/linux/net.h - - Socket Buffer Functions -!Iinclude/linux/skbuff.h -!Iinclude/net/sock.h -!Enet/socket.c -!Enet/core/skbuff.c -!Enet/core/sock.c -!Enet/core/datagram.c -!Enet/core/stream.c - - Socket Filter -!Enet/core/filter.c - - Generic Network Statistics -!Iinclude/uapi/linux/gen_stats.h -!Enet/core/gen_stats.c -!Enet/core/gen_estimator.c - - SUN RPC subsystem - -!Enet/sunrpc/xdr.c -!Enet/sunrpc/svc_xprt.c -!Enet/sunrpc/xprt.c -!Enet/sunrpc/sched.c -!Enet/sunrpc/socklib.c -!Enet/sunrpc/stats.c -!Enet/sunrpc/rpc_pipe.c -!Enet/sunrpc/rpcb_clnt.c -!Enet/sunrpc/clnt.c - - WiMAX -!Enet/wimax/op-msg.c -!Enet/wimax/op-reset.c -!Enet/wimax/op-rfkill.c -!Enet/wimax/stack.c -!Iinclude/net/wimax.h -!Iinclude/uapi/linux/wimax.h - - - - - Network device support - Driver Support -!Enet/core/dev.c -!Enet/ethernet/eth.c -!Enet/sched/sch_generic.c -!Iinclude/linux/etherdevice.h -!Iinclude/linux/netdevice.h - - PHY Support -!Edrivers/net/phy/phy.c -!Idrivers/net/phy/phy.c -!Edrivers/net/phy/phy_device.c -!Idrivers/net/phy/phy_device.c -!Edrivers/net/phy/mdio_bus.c -!Idrivers/net/phy/mdio_bus.c - - - - - diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl deleted file mode 100644 index ac3cca3399a1..000000000000 --- a/Documentation/DocBook/rapidio.tmpl +++ /dev/null @@ -1,155 +0,0 @@ - - - ]> - - - - RapidIO Subsystem Guide - - - - Matt - Porter - -
- mporter@kernel.crashing.org - mporter@mvista.com -
-
-
-
- - - 2005 - MontaVista Software, Inc. - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - RapidIO is a high speed switched fabric interconnect with - features aimed at the embedded market. RapidIO provides - support for memory-mapped I/O as well as message-based - transactions over the switched fabric network. RapidIO has - a standardized discovery mechanism not unlike the PCI bus - standard that allows simple detection of devices in a - network. - - - This documentation is provided for developers intending - to support RapidIO on new architectures, write new drivers, - or to understand the subsystem internals. - - - - - Known Bugs and Limitations - - - Bugs - None. ;) - - - Limitations - - - Access/management of RapidIO memory regions is not supported - Multiple host enumeration is not supported - - - - - - - RapidIO driver interface - - Drivers are provided a set of calls in order - to interface with the subsystem to gather info - on devices, request/map memory region resources, - and manage mailboxes/doorbells. - - - Functions -!Iinclude/linux/rio_drv.h -!Edrivers/rapidio/rio-driver.c -!Edrivers/rapidio/rio.c - - - - - Internals - - - This chapter contains the autogenerated documentation of the RapidIO - subsystem. - - - Structures -!Iinclude/linux/rio.h - - Enumeration and Discovery -!Idrivers/rapidio/rio-scan.c - - Driver functionality -!Idrivers/rapidio/rio.c -!Idrivers/rapidio/rio-access.c - - Device model support -!Idrivers/rapidio/rio-driver.c - - PPC32 support -!Iarch/powerpc/sysdev/fsl_rio.c - - - - - Credits - - The following people have contributed to the RapidIO - subsystem directly or indirectly: - - Matt Portermporter@kernel.crashing.org - Randy Vinsonrvinson@mvista.com - Dan Malekdan@embeddedalley.com - - - - The following people have contributed to this document: - - Matt Portermporter@kernel.crashing.org - - - -
diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl deleted file mode 100644 index 95bfc12e5439..000000000000 --- a/Documentation/DocBook/s390-drivers.tmpl +++ /dev/null @@ -1,161 +0,0 @@ - - - - - - Writing s390 channel device drivers - - - - Cornelia - Huck - -
- cornelia.huck@de.ibm.com -
-
-
-
- - - 2007 - IBM Corp. - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - This document describes the interfaces available for device drivers that - drive s390 based channel attached I/O devices. This includes interfaces for - interaction with the hardware and interfaces for interacting with the - common driver core. Those interfaces are provided by the s390 common I/O - layer. - - - The document assumes a familarity with the technical terms associated - with the s390 channel I/O architecture. For a description of this - architecture, please refer to the "z/Architecture: Principles of - Operation", IBM publication no. SA22-7832. - - - While most I/O devices on a s390 system are typically driven through the - channel I/O mechanism described here, there are various other methods - (like the diag interface). These are out of the scope of this document. - - - Some additional information can also be found in the kernel source - under Documentation/s390/driver-model.txt. - - - - The ccw bus - - The ccw bus typically contains the majority of devices available to - a s390 system. Named after the channel command word (ccw), the basic - command structure used to address its devices, the ccw bus contains - so-called channel attached devices. They are addressed via I/O - subchannels, visible on the css bus. A device driver for - channel-attached devices, however, will never interact with the - subchannel directly, but only via the I/O device on the ccw bus, - the ccw device. - - - I/O functions for channel-attached devices - - Some hardware structures have been translated into C structures for use - by the common I/O layer and device drivers. For more information on - the hardware structures represented here, please consult the Principles - of Operation. - -!Iarch/s390/include/asm/cio.h - - - ccw devices - - Devices that want to initiate channel I/O need to attach to the ccw bus. - Interaction with the driver core is done via the common I/O layer, which - provides the abstractions of ccw devices and ccw device drivers. - - - The functions that initiate or terminate channel I/O all act upon a - ccw device structure. Device drivers must not bypass those functions - or strange side effects may happen. - -!Iarch/s390/include/asm/ccwdev.h -!Edrivers/s390/cio/device.c -!Edrivers/s390/cio/device_ops.c - - - The channel-measurement facility - - The channel-measurement facility provides a means to collect - measurement data which is made available by the channel subsystem - for each channel attached device. - -!Iarch/s390/include/asm/cmb.h -!Edrivers/s390/cio/cmf.c - - - - - The ccwgroup bus - - The ccwgroup bus only contains artificial devices, created by the user. - Many networking devices (e.g. qeth) are in fact composed of several - ccw devices (like read, write and data channel for qeth). The - ccwgroup bus provides a mechanism to create a meta-device which - contains those ccw devices as slave devices and can be associated - with the netdevice. - - - ccw group devices -!Iarch/s390/include/asm/ccwgroup.h -!Edrivers/s390/cio/ccwgroup.c - - - - - Generic interfaces - - Some interfaces are available to other drivers that do not necessarily - have anything to do with the busses described above, but still are - indirectly using basic infrastructure in the common I/O layer. - One example is the support for adapter interrupts. - -!Edrivers/s390/cio/airq.c - - -
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl deleted file mode 100644 index 4b9b9b286cea..000000000000 --- a/Documentation/DocBook/scsi.tmpl +++ /dev/null @@ -1,409 +0,0 @@ - - - - - - SCSI Interfaces Guide - - - - James - Bottomley - -
- James.Bottomley@hansenpartnership.com -
-
-
- - - Rob - Landley - -
- rob@landley.net -
-
-
- -
- - - 2007 - Linux Foundation - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - Protocol vs bus - - Once upon a time, the Small Computer Systems Interface defined both - a parallel I/O bus and a data protocol to connect a wide variety of - peripherals (disk drives, tape drives, modems, printers, scanners, - optical drives, test equipment, and medical devices) to a host - computer. - - - Although the old parallel (fast/wide/ultra) SCSI bus has largely - fallen out of use, the SCSI command set is more widely used than ever - to communicate with devices over a number of different busses. - - - The SCSI protocol - is a big-endian peer-to-peer packet based protocol. SCSI commands - are 6, 10, 12, or 16 bytes long, often followed by an associated data - payload. - - - SCSI commands can be transported over just about any kind of bus, and - are the default protocol for storage devices attached to USB, SATA, - SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are - also commonly exchanged over Infiniband, - I20, TCP/IP - (iSCSI), even - Parallel - ports. - - - - Design of the Linux SCSI subsystem - - The SCSI subsystem uses a three layer design, with upper, mid, and low - layers. Every operation involving the SCSI subsystem (such as reading - a sector from a disk) uses one driver at each of the 3 levels: one - upper layer driver, one lower layer driver, and the SCSI midlayer. - - - The SCSI upper layer provides the interface between userspace and the - kernel, in the form of block and char device nodes for I/O and - ioctl(). The SCSI lower layer contains drivers for specific hardware - devices. - - - In between is the SCSI mid-layer, analogous to a network routing - layer such as the IPv4 stack. The SCSI mid-layer routes a packet - based data protocol between the upper layer's /dev nodes and the - corresponding devices in the lower layer. It manages command queues, - provides error handling and power management functions, and responds - to ioctl() requests. - - - - - - SCSI upper layer - - The upper layer supports the user-kernel interface by providing - device nodes. - - - sd (SCSI Disk) - sd (sd_mod.o) - - - - sr (SCSI CD-ROM) - sr (sr_mod.o) - - - st (SCSI Tape) - st (st.o) - - - sg (SCSI Generic) - sg (sg.o) - - - ch (SCSI Media Changer) - ch (ch.c) - - - - - SCSI mid layer - - - SCSI midlayer implementation - - include/scsi/scsi_device.h - - -!Iinclude/scsi/scsi_device.h - - - - drivers/scsi/scsi.c - Main file for the SCSI midlayer. -!Edrivers/scsi/scsi.c - - - drivers/scsi/scsicam.c - - SCSI - Common Access Method support functions, for use with - HDIO_GETGEO, etc. - -!Edrivers/scsi/scsicam.c - - - drivers/scsi/scsi_error.c - Common SCSI error/timeout handling routines. -!Edrivers/scsi/scsi_error.c - - - drivers/scsi/scsi_devinfo.c - - Manage scsi_dev_info_list, which tracks blacklisted and whitelisted - devices. - -!Idrivers/scsi/scsi_devinfo.c - - - drivers/scsi/scsi_ioctl.c - - Handle ioctl() calls for SCSI devices. - -!Edrivers/scsi/scsi_ioctl.c - - - drivers/scsi/scsi_lib.c - - SCSI queuing library. - -!Edrivers/scsi/scsi_lib.c - - - drivers/scsi/scsi_lib_dma.c - - SCSI library functions depending on DMA - (map and unmap scatter-gather lists). - -!Edrivers/scsi/scsi_lib_dma.c - - - drivers/scsi/scsi_module.c - - The file drivers/scsi/scsi_module.c contains legacy support for - old-style host templates. It should never be used by any new driver. - - - - drivers/scsi/scsi_proc.c - - The functions in this file provide an interface between - the PROC file system and the SCSI device drivers - It is mainly used for debugging, statistics and to pass - information directly to the lowlevel driver. - - I.E. plumbing to manage /proc/scsi/* - -!Idrivers/scsi/scsi_proc.c - - - drivers/scsi/scsi_netlink.c - - Infrastructure to provide async events from transports to userspace - via netlink, using a single NETLINK_SCSITRANSPORT protocol for all - transports. - - See the - original patch submission for more details. - -!Idrivers/scsi/scsi_netlink.c - - - drivers/scsi/scsi_scan.c - - Scan a host to determine which (if any) devices are attached. - - The general scanning/probing algorithm is as follows, exceptions are - made to it depending on device specific flags, compilation options, - and global variable (boot or module load time) settings. - - A specific LUN is scanned via an INQUIRY command; if the LUN has a - device attached, a scsi_device is allocated and setup for it. - - For every id of every channel on the given host, start by scanning - LUN 0. Skip hosts that don't respond at all to a scan of LUN 0. - Otherwise, if LUN 0 has a device attached, allocate and setup a - scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN, - and scan all of the LUNs returned by the REPORT LUN; else, - sequentially scan LUNs up until some maximum is reached, or a LUN is - seen that cannot have a device attached to it. - -!Idrivers/scsi/scsi_scan.c - - - drivers/scsi/scsi_sysctl.c - - Set up the sysctl entry: "/dev/scsi/logging_level" - (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level. - - - - drivers/scsi/scsi_sysfs.c - - SCSI sysfs interface routines. - -!Edrivers/scsi/scsi_sysfs.c - - - drivers/scsi/hosts.c - - mid to lowlevel SCSI driver interface - -!Edrivers/scsi/hosts.c - - - drivers/scsi/constants.c - - mid to lowlevel SCSI driver interface - -!Edrivers/scsi/constants.c - - - - - Transport classes - - Transport classes are service libraries for drivers in the SCSI - lower layer, which expose transport attributes in sysfs. - - - Fibre Channel transport - - The file drivers/scsi/scsi_transport_fc.c defines transport attributes - for Fibre Channel. - -!Edrivers/scsi/scsi_transport_fc.c - - - iSCSI transport class - - The file drivers/scsi/scsi_transport_iscsi.c defines transport - attributes for the iSCSI class, which sends SCSI packets over TCP/IP - connections. - -!Edrivers/scsi/scsi_transport_iscsi.c - - - Serial Attached SCSI (SAS) transport class - - The file drivers/scsi/scsi_transport_sas.c defines transport - attributes for Serial Attached SCSI, a variant of SATA aimed at - large high-end systems. - - - The SAS transport class contains common code to deal with SAS HBAs, - an aproximated representation of SAS topologies in the driver model, - and various sysfs attributes to expose these topologies and management - interfaces to userspace. - - - In addition to the basic SCSI core objects this transport class - introduces two additional intermediate objects: The SAS PHY - as represented by struct sas_phy defines an "outgoing" PHY on - a SAS HBA or Expander, and the SAS remote PHY represented by - struct sas_rphy defines an "incoming" PHY on a SAS Expander or - end device. Note that this is purely a software concept, the - underlying hardware for a PHY and a remote PHY is the exactly - the same. - - - There is no concept of a SAS port in this code, users can see - what PHYs form a wide port based on the port_identifier attribute, - which is the same for all PHYs in a port. - -!Edrivers/scsi/scsi_transport_sas.c - - - SATA transport class - - The SATA transport is handled by libata, which has its own book of - documentation in this directory. - - - - Parallel SCSI (SPI) transport class - - The file drivers/scsi/scsi_transport_spi.c defines transport - attributes for traditional (fast/wide/ultra) SCSI busses. - -!Edrivers/scsi/scsi_transport_spi.c - - - SCSI RDMA (SRP) transport class - - The file drivers/scsi/scsi_transport_srp.c defines transport - attributes for SCSI over Remote Direct Memory Access. - -!Edrivers/scsi/scsi_transport_srp.c - - - - - - - SCSI lower layer - - Host Bus Adapter transport types - - Many modern device controllers use the SCSI command set as a protocol to - communicate with their devices through many different types of physical - connections. - - - In SCSI language a bus capable of carrying SCSI commands is - called a "transport", and a controller connecting to such a bus is - called a "host bus adapter" (HBA). - - - Debug transport - - The file drivers/scsi/scsi_debug.c simulates a host adapter with a - variable number of disks (or disk like devices) attached, sharing a - common amount of RAM. Does a lot of checking to make sure that we are - not getting blocks mixed up, and panics the kernel if anything out of - the ordinary is seen. - - - To be more realistic, the simulated devices have the transport - attributes of SAS disks. - - - For documentation see - http://sg.danny.cz/sg/sdebug26.html - - - - - todo - Parallel (fast/wide/ultra) SCSI, USB, SATA, - SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband, - I20, iSCSI, Parallel ports, netlink... - - - - -
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl deleted file mode 100644 index 4a38f604fa66..000000000000 --- a/Documentation/DocBook/sh.tmpl +++ /dev/null @@ -1,105 +0,0 @@ - - - - - - SuperH Interfaces Guide - - - - Paul - Mundt - -
- lethal@linux-sh.org -
-
-
-
- - - 2008-2010 - Paul Mundt - - - 2008-2010 - Renesas Technology Corp. - - - 2010 - Renesas Electronics Corp. - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Memory Management - - SH-4 - - Store Queue API -!Earch/sh/kernel/cpu/sh4/sq.c - - - - SH-5 - - TLB Interfaces -!Iarch/sh/mm/tlb-sh5.c -!Iarch/sh/include/asm/tlb_64.h - - - - - Machine Specific Interfaces - - mach-dreamcast -!Iarch/sh/boards/mach-dreamcast/rtc.c - - - mach-x3proto -!Earch/sh/boards/mach-x3proto/ilsel.c - - - - Busses - - SuperHyway -!Edrivers/sh/superhyway/superhyway.c - - - - Maple -!Edrivers/sh/maple/maple.c - - -
diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl deleted file mode 100644 index 3bf4ecf3d760..000000000000 --- a/Documentation/DocBook/stylesheet.xsl +++ /dev/null @@ -1,11 +0,0 @@ - - -1 -ansi -80 -0 - -1 -2 -1 - diff --git a/Documentation/DocBook/w1.tmpl b/Documentation/DocBook/w1.tmpl deleted file mode 100644 index b0228d4c81bb..000000000000 --- a/Documentation/DocBook/w1.tmpl +++ /dev/null @@ -1,101 +0,0 @@ - - - - - - W1: Dallas' 1-wire bus - - - - David - Fries - -
- David@Fries.net -
-
-
- -
- - - 2013 - - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - W1 API internal to the kernel - - - W1 API internal to the kernel - - drivers/w1/w1.h - W1 core functions. -!Idrivers/w1/w1.h - - - - drivers/w1/w1.c - W1 core functions. -!Idrivers/w1/w1.c - - - - drivers/w1/w1_family.h - Allows registering device family operations. -!Idrivers/w1/w1_family.h - - - - drivers/w1/w1_family.c - Allows registering device family operations. -!Edrivers/w1/w1_family.c - - - - drivers/w1/w1_int.c - W1 internal initialization for master devices. -!Edrivers/w1/w1_int.c - - - - drivers/w1/w1_netlink.h - W1 external netlink API structures and commands. -!Idrivers/w1/w1_netlink.h - - - - drivers/w1/w1_io.c - W1 input/output. -!Edrivers/w1/w1_io.c -!Idrivers/w1/w1_io.c - - - - - - - -
diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl deleted file mode 100644 index 6f3883be877e..000000000000 --- a/Documentation/DocBook/z8530book.tmpl +++ /dev/null @@ -1,371 +0,0 @@ - - - - - - Z8530 Programming Guide - - - - Alan - Cox - -
- alan@lxorguk.ukuu.org.uk -
-
-
-
- - - 2000 - Alan Cox - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - The Z85x30 family synchronous/asynchronous controller chips are - used on a large number of cheap network interface cards. The - kernel provides a core interface layer that is designed to make - it easy to provide WAN services using this chip. - - - The current driver only support synchronous operation. Merging the - asynchronous driver support into this code to allow any Z85x30 - device to be used as both a tty interface and as a synchronous - controller is a project for Linux post the 2.4 release - - - - - Driver Modes - - The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices - in three different modes. Each mode can be applied to an individual - channel on the chip (each chip has two channels). - - - The PIO synchronous mode supports the most common Z8530 wiring. Here - the chip is interface to the I/O and interrupt facilities of the - host machine but not to the DMA subsystem. When running PIO the - Z8530 has extremely tight timing requirements. Doing high speeds, - even with a Z85230 will be tricky. Typically you should expect to - achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230. - - - The DMA mode supports the chip when it is configured to use dual DMA - channels on an ISA bus. The better cards tend to support this mode - of operation for a single channel. With DMA running the Z85230 tops - out when it starts to hit ISA DMA constraints at about 512Kbits. It - is worth noting here that many PC machines hang or crash when the - chip is driven fast enough to hold the ISA bus solid. - - - Transmit DMA mode uses a single DMA channel. The DMA channel is used - for transmission as the transmit FIFO is smaller than the receive - FIFO. it gives better performance than pure PIO mode but is nowhere - near as ideal as pure DMA mode. - - - - - Using the Z85230 driver - - The Z85230 driver provides the back end interface to your board. To - configure a Z8530 interface you need to detect the board and to - identify its ports and interrupt resources. It is also your problem - to verify the resources are available. - - - Having identified the chip you need to fill in a struct z8530_dev, - which describes each chip. This object must exist until you finally - shutdown the board. Firstly zero the active field. This ensures - nothing goes off without you intending it. The irq field should - be set to the interrupt number of the chip. (Each chip has a single - interrupt source rather than each channel). You are responsible - for allocating the interrupt line. The interrupt handler should be - set to z8530_interrupt. The device id should - be set to the z8530_dev structure pointer. Whether the interrupt can - be shared or not is board dependent, and up to you to initialise. - - - The structure holds two channel structures. - Initialise chanA.ctrlio and chanA.dataio with the address of the - control and data ports. You can or this with Z8530_PORT_SLEEP to - indicate your interface needs the 5uS delay for chip settling done - in software. The PORT_SLEEP option is architecture specific. Other - flags may become available on future platforms, eg for MMIO. - Initialise the chanA.irqs to &z8530_nop to start the chip up - as disabled and discarding interrupt events. This ensures that - stray interrupts will be mopped up and not hang the bus. Set - chanA.dev to point to the device structure itself. The - private and name field you may use as you wish. The private field - is unused by the Z85230 layer. The name is used for error reporting - and it may thus make sense to make it match the network name. - - - Repeat the same operation with the B channel if your chip has - both channels wired to something useful. This isn't always the - case. If it is not wired then the I/O values do not matter, but - you must initialise chanB.dev. - - - If your board has DMA facilities then initialise the txdma and - rxdma fields for the relevant channels. You must also allocate the - ISA DMA channels and do any necessary board level initialisation - to configure them. The low level driver will do the Z8530 and - DMA controller programming but not board specific magic. - - - Having initialised the device you can then call - z8530_init. This will probe the chip and - reset it into a known state. An identification sequence is then - run to identify the chip type. If the checks fail to pass the - function returns a non zero error code. Typically this indicates - that the port given is not valid. After this call the - type field of the z8530_dev structure is initialised to either - Z8530, Z85C30 or Z85230 according to the chip found. - - - Once you have called z8530_init you can also make use of the utility - function z8530_describe. This provides a - consistent reporting format for the Z8530 devices, and allows all - the drivers to provide consistent reporting. - - - - - Attaching Network Interfaces - - If you wish to use the network interface facilities of the driver, - then you need to attach a network device to each channel that is - present and in use. In addition to use the generic HDLC - you need to follow some additional plumbing rules. They may seem - complex but a look at the example hostess_sv11 driver should - reassure you. - - - The network device used for each channel should be pointed to by - the netdevice field of each channel. The hdlc-> priv field of the - network device points to your private data - you will need to be - able to find your private data from this. - - - The way most drivers approach this particular problem is to - create a structure holding the Z8530 device definition and - put that into the private field of the network device. The - network device fields of the channels then point back to the - network devices. - - - If you wish to use the generic HDLC then you need to register - the HDLC device. - - - Before you register your network device you will also need to - provide suitable handlers for most of the network device callbacks. - See the network device documentation for more details on this. - - - - - Configuring And Activating The Port - - The Z85230 driver provides helper functions and tables to load the - port registers on the Z8530 chips. When programming the register - settings for a channel be aware that the documentation recommends - initialisation orders. Strange things happen when these are not - followed. - - - z8530_channel_load takes an array of - pairs of initialisation values in an array of u8 type. The first - value is the Z8530 register number. Add 16 to indicate the alternate - register bank on the later chips. The array is terminated by a 255. - - - The driver provides a pair of public tables. The - z8530_hdlc_kilostream table is for the UK 'Kilostream' service and - also happens to cover most other end host configurations. The - z8530_hdlc_kilostream_85230 table is the same configuration using - the enhancements of the 85230 chip. The configuration loaded is - standard NRZ encoded synchronous data with HDLC bitstuffing. All - of the timing is taken from the other end of the link. - - - When writing your own tables be aware that the driver internally - tracks register values. It may need to reload values. You should - therefore be sure to set registers 1-7, 9-11, 14 and 15 in all - configurations. Where the register settings depend on DMA selection - the driver will update the bits itself when you open or close. - Loading a new table with the interface open is not recommended. - - - There are three standard configurations supported by the core - code. In PIO mode the interface is programmed up to use - interrupt driven PIO. This places high demands on the host processor - to avoid latency. The driver is written to take account of latency - issues but it cannot avoid latencies caused by other drivers, - notably IDE in PIO mode. Because the drivers allocate buffers you - must also prevent MTU changes while the port is open. - - - Once the port is open it will call the rx_function of each channel - whenever a completed packet arrived. This is invoked from - interrupt context and passes you the channel and a network - buffer (struct sk_buff) holding the data. The data includes - the CRC bytes so most users will want to trim the last two - bytes before processing the data. This function is very timing - critical. When you wish to simply discard data the support - code provides the function z8530_null_rx - to discard the data. - - - To active PIO mode sending and receiving the - z8530_sync_open is called. This expects to be passed - the network device and the channel. Typically this is called from - your network device open callback. On a failure a non zero error - status is returned. The z8530_sync_close - function shuts down a PIO channel. This must be done before the - channel is opened again and before the driver shuts down - and unloads. - - - The ideal mode of operation is dual channel DMA mode. Here the - kernel driver will configure the board for DMA in both directions. - The driver also handles ISA DMA issues such as controller - programming and the memory range limit for you. This mode is - activated by calling the z8530_sync_dma_open - function. On failure a non zero error value is returned. - Once this mode is activated it can be shut down by calling the - z8530_sync_dma_close. You must call the close - function matching the open mode you used. - - - The final supported mode uses a single DMA channel to drive the - transmit side. As the Z85C30 has a larger FIFO on the receive - channel this tends to increase the maximum speed a little. - This is activated by calling the z8530_sync_txdma_open - . This returns a non zero error code on failure. The - z8530_sync_txdma_close function closes down - the Z8530 interface from this mode. - - - - - Network Layer Functions - - The Z8530 layer provides functions to queue packets for - transmission. The driver internally buffers the frame currently - being transmitted and one further frame (in order to keep back - to back transmission running). Any further buffering is up to - the caller. - - - The function z8530_queue_xmit takes a network - buffer in sk_buff format and queues it for transmission. The - caller must provide the entire packet with the exception of the - bitstuffing and CRC. This is normally done by the caller via - the generic HDLC interface layer. It returns 0 if the buffer has been - queued and non zero values for queue full. If the function accepts - the buffer it becomes property of the Z8530 layer and the caller - should not free it. - - - The function z8530_get_stats returns a pointer - to an internally maintained per interface statistics block. This - provides most of the interface code needed to implement the network - layer get_stats callback. - - - - - Porting The Z8530 Driver - - The Z8530 driver is written to be portable. In DMA mode it makes - assumptions about the use of ISA DMA. These are probably warranted - in most cases as the Z85230 in particular was designed to glue to PC - type machines. The PIO mode makes no real assumptions. - - - Should you need to retarget the Z8530 driver to another architecture - the only code that should need changing are the port I/O functions. - At the moment these assume PC I/O port accesses. This may not be - appropriate for all platforms. Replacing - z8530_read_port and z8530_write_port - is intended to be all that is required to port this - driver layer. - - - - - Known Bugs And Assumptions - - - Interrupt Locking - - - The locking in the driver is done via the global cli/sti lock. This - makes for relatively poor SMP performance. Switching this to use a - per device spin lock would probably materially improve performance. - - - - Occasional Failures - - - We have reports of occasional failures when run for very long - periods of time and the driver starts to receive junk frames. At - the moment the cause of this is not clear. - - - - - - - - - Public Functions Provided -!Edrivers/net/wan/z85230.c - - - - Internal Functions -!Idrivers/net/wan/z85230.c - - -
diff --git a/Documentation/Makefile b/Documentation/Makefile index c2a469112c37..a42320385df3 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -1 +1,126 @@ +# -*- makefile -*- +# Makefile for Sphinx documentation +# + subdir-y := + +# You can set these variables from the command line. +SPHINXBUILD = sphinx-build +SPHINXOPTS = +SPHINXDIRS = . +_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py)) +SPHINX_CONF = conf.py +PAPER = +BUILDDIR = $(obj)/output +PDFLATEX = xelatex +LATEXOPTS = -interaction=batchmode + +# User-friendly check for sphinx-build +HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi) + +ifeq ($(HAVE_SPHINX),0) + +.DEFAULT: + $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.) + @echo " SKIP Sphinx $@ target." + +else # HAVE_SPHINX + +# User-friendly check for pdflatex +HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi) + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +KERNELDOC = $(srctree)/scripts/kernel-doc +KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC) +ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +# commands; the 'cmd' from scripts/Kbuild.include is not *loopable* +loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit; + +# $2 sphinx builder e.g. "html" +# $3 name of the build subfolder / e.g. "media", used as: +# * dest folder relative to $(BUILDDIR) and +# * cache folder relative to $(BUILDDIR)/.doctrees +# $4 dest subfolder e.g. "man" for man pages at media/man +# $5 reST source folder relative to $(srctree)/$(src), +# e.g. "media" for the linux-tv book-set at ./Documentation/media + +quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) + cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \ + PYTHONDONTWRITEBYTECODE=1 \ + BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \ + $(SPHINXBUILD) \ + -b $2 \ + -c $(abspath $(srctree)/$(src)) \ + -d $(abspath $(BUILDDIR)/.doctrees/$3) \ + -D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \ + $(ALLSPHINXOPTS) \ + $(abspath $(srctree)/$(src)/$5) \ + $(abspath $(BUILDDIR)/$3/$4) + +htmldocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) + +linkcheckdocs: + @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var))) + +latexdocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var))) + +ifeq ($(HAVE_PDFLATEX),0) + +pdfdocs: + $(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.) + @echo " SKIP Sphinx $@ target." + +else # HAVE_PDFLATEX + +pdfdocs: latexdocs + $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;) + +endif # HAVE_PDFLATEX + +epubdocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var))) + +xmldocs: + @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var))) + +endif # HAVE_SPHINX + +# The following targets are independent of HAVE_SPHINX, and the rules should +# work or silently pass without Sphinx. + +# no-ops for the Sphinx toolchain +sgmldocs: + @: +psdocs: + @: +mandocs: + @: +installmandocs: + @: + +cleandocs: + $(Q)rm -rf $(BUILDDIR) + $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean + +dochelp: + @echo ' Linux kernel internal documentation in different formats from ReST:' + @echo ' htmldocs - HTML' + @echo ' latexdocs - LaTeX' + @echo ' pdfdocs - PDF' + @echo ' epubdocs - EPUB' + @echo ' xmldocs - XML' + @echo ' linkcheckdocs - check for broken external links (will connect to external hosts)' + @echo ' cleandocs - clean all generated files' + @echo + @echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2' + @echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)' + @echo + @echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build' + @echo ' configuration. This is e.g. useful to build with nit-picking config.' diff --git a/Documentation/Makefile.sphinx b/Documentation/Makefile.sphinx deleted file mode 100644 index bcf529f6cf9b..000000000000 --- a/Documentation/Makefile.sphinx +++ /dev/null @@ -1,130 +0,0 @@ -# -*- makefile -*- -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXBUILD = sphinx-build -SPHINXOPTS = -SPHINXDIRS = . -_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py)) -SPHINX_CONF = conf.py -PAPER = -BUILDDIR = $(obj)/output -PDFLATEX = xelatex -LATEXOPTS = -interaction=batchmode - -# User-friendly check for sphinx-build -HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi) - -ifeq ($(HAVE_SPHINX),0) - -.DEFAULT: - $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.) - @echo " SKIP Sphinx $@ target." - -else ifneq ($(DOCBOOKS),) - -# Skip Sphinx build if the user explicitly requested DOCBOOKS. -.DEFAULT: - @echo " SKIP Sphinx $@ target (DOCBOOKS specified)." - -else # HAVE_SPHINX - -# User-friendly check for pdflatex -HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi) - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -KERNELDOC = $(srctree)/scripts/kernel-doc -KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC) -ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -# commands; the 'cmd' from scripts/Kbuild.include is not *loopable* -loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit; - -# $2 sphinx builder e.g. "html" -# $3 name of the build subfolder / e.g. "media", used as: -# * dest folder relative to $(BUILDDIR) and -# * cache folder relative to $(BUILDDIR)/.doctrees -# $4 dest subfolder e.g. "man" for man pages at media/man -# $5 reST source folder relative to $(srctree)/$(src), -# e.g. "media" for the linux-tv book-set at ./Documentation/media - -quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) - cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \ - PYTHONDONTWRITEBYTECODE=1 \ - BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \ - $(SPHINXBUILD) \ - -b $2 \ - -c $(abspath $(srctree)/$(src)) \ - -d $(abspath $(BUILDDIR)/.doctrees/$3) \ - -D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \ - $(ALLSPHINXOPTS) \ - $(abspath $(srctree)/$(src)/$5) \ - $(abspath $(BUILDDIR)/$3/$4) - -htmldocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) - -linkcheckdocs: - @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var))) - -latexdocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var))) - -ifeq ($(HAVE_PDFLATEX),0) - -pdfdocs: - $(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.) - @echo " SKIP Sphinx $@ target." - -else # HAVE_PDFLATEX - -pdfdocs: latexdocs - $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;) - -endif # HAVE_PDFLATEX - -epubdocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var))) - -xmldocs: - @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var))) - -endif # HAVE_SPHINX - -# The following targets are independent of HAVE_SPHINX, and the rules should -# work or silently pass without Sphinx. - -# no-ops for the Sphinx toolchain -sgmldocs: - @: -psdocs: - @: -mandocs: - @: -installmandocs: - @: - -cleandocs: - $(Q)rm -rf $(BUILDDIR) - $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean - -dochelp: - @echo ' Linux kernel internal documentation in different formats (Sphinx):' - @echo ' htmldocs - HTML' - @echo ' latexdocs - LaTeX' - @echo ' pdfdocs - PDF' - @echo ' epubdocs - EPUB' - @echo ' xmldocs - XML' - @echo ' linkcheckdocs - check for broken external links (will connect to external hosts)' - @echo ' cleandocs - clean all generated files' - @echo - @echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2' - @echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)' - @echo - @echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build' - @echo ' configuration. This is e.g. useful to build with nit-picking config.' diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1e37138027a3..618e13d5e276 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -186,7 +186,7 @@ must disable interrupts while the lock is held. If the device sends a different interrupt, the driver will deadlock trying to recursively acquire the spinlock. Such deadlocks can be avoided by using spin_lock_irqsave() or spin_lock_irq() which disable local interrupts -and acquire the lock (see Documentation/DocBook/kernel-locking). +and acquire the lock (see Documentation/kernel-hacking/locking.rst). 4.5 How to tell whether MSI/MSI-X is enabled on a device diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index b96e80f79e85..b5343c5aa224 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -55,12 +55,6 @@ Documentation contains information about the problems, which may result by upgrading your kernel. - - The Documentation/DocBook/ subdirectory contains several guides for - kernel developers and users. These guides can be rendered in a - number of formats: PostScript (.ps), PDF, HTML, & man-pages, among others. - After installation, ``make psdocs``, ``make pdfdocs``, ``make htmldocs``, - or ``make mandocs`` will render the documentation in the requested format. - Installing the kernel source ---------------------------- diff --git a/Documentation/conf.py b/Documentation/conf.py index bacf9d337c89..77d47bb1df1d 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -281,6 +281,7 @@ latex_elements = { \\definecolor{NoteColor}{RGB}{204,255,255} \\definecolor{WarningColor}{RGB}{255,204,204} \\definecolor{AttentionColor}{RGB}{255,255,204} + \\definecolor{ImportantColor}{RGB}{192,255,204} \\definecolor{OtherColor}{RGB}{204,204,204} \\newlength{\\mynoticelength} \\makeatletter\\newenvironment{coloredbox}[1]{% @@ -301,7 +302,12 @@ latex_elements = { \\ifthenelse% {\\equal{\\py@noticetype}{attention}}% {\\colorbox{AttentionColor}{\\usebox{\\@tempboxa}}}% - {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}% + {% + \\ifthenelse% + {\\equal{\\py@noticetype}{important}}% + {\\colorbox{ImportantColor}{\\usebox{\\@tempboxa}}}% + {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}% + }% }% }% }\\makeatother @@ -339,27 +345,42 @@ if major == 1 and minor > 3: # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). +# Sorted in alphabetical order latex_documents = [ - ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide', - 'The kernel development community', 'manual'), ('admin-guide/index', 'linux-user.tex', 'Linux Kernel User Documentation', 'The kernel development community', 'manual'), ('core-api/index', 'core-api.tex', 'The kernel core API manual', 'The kernel development community', 'manual'), + ('crypto/index', 'crypto-api.tex', 'Linux Kernel Crypto API manual', + 'The kernel development community', 'manual'), + ('dev-tools/index', 'dev-tools.tex', 'Development tools for the Kernel', + 'The kernel development community', 'manual'), + ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide', + 'The kernel development community', 'manual'), ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual', 'The kernel development community', 'manual'), - ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', - 'The kernel development community', 'manual'), - ('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation', - 'The kernel development community', 'manual'), - ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation', + ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 'The kernel development community', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 'The kernel development community', 'manual'), + ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', + 'The kernel development community', 'manual'), + ('kernel-hacking/index', 'kernel-hacking.tex', 'Unreliable Guide To Hacking The Linux Kernel', + 'The kernel development community', 'manual'), ('media/index', 'media.tex', 'Linux Media Subsystem Documentation', 'The kernel development community', 'manual'), + ('networking/index', 'networking.tex', 'Linux Networking Documentation', + 'The kernel development community', 'manual'), + ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation', + 'The kernel development community', 'manual'), ('security/index', 'security.tex', 'The kernel security subsystem manual', 'The kernel development community', 'manual'), + ('sh/index', 'sh.tex', 'SuperH architecture implementation manual', + 'The kernel development community', 'manual'), + ('sound/index', 'sound.tex', 'Linux Sound Subsystem Documentation', + 'The kernel development community', 'manual'), + ('userspace-api/index', 'userspace-api.tex', 'The Linux kernel user-space API guide', + 'The kernel development community', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 62abd36bfffb..0606be3a3111 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -19,6 +19,7 @@ Core utilities workqueue genericirq flexible-arrays + librs Interfaces for kernel debugging =============================== diff --git a/Documentation/core-api/librs.rst b/Documentation/core-api/librs.rst new file mode 100644 index 000000000000..6010f5bc5bf9 --- /dev/null +++ b/Documentation/core-api/librs.rst @@ -0,0 +1,212 @@ +========================================== +Reed-Solomon Library Programming Interface +========================================== + +:Author: Thomas Gleixner + +Introduction +============ + +The generic Reed-Solomon Library provides encoding, decoding and error +correction functions. + +Reed-Solomon codes are used in communication and storage applications to +ensure data integrity. + +This documentation is provided for developers who want to utilize the +functions provided by the library. + +Known Bugs And Assumptions +========================== + +None. + +Usage +===== + +This chapter provides examples of how to use the library. + +Initializing +------------ + +The init function init_rs returns a pointer to an rs decoder structure, +which holds the necessary information for encoding, decoding and error +correction with the given polynomial. It either uses an existing +matching decoder or creates a new one. On creation all the lookup tables +for fast en/decoding are created. The function may take a while, so make +sure not to call it in critical code paths. + +:: + + /* the Reed Solomon control structure */ + static struct rs_control *rs_decoder; + + /* Symbolsize is 10 (bits) + * Primitive polynomial is x^10+x^3+1 + * first consecutive root is 0 + * primitive element to generate roots = 1 + * generator polynomial degree (number of roots) = 6 + */ + rs_decoder = init_rs (10, 0x409, 0, 1, 6); + + +Encoding +-------- + +The encoder calculates the Reed-Solomon code over the given data length +and stores the result in the parity buffer. Note that the parity buffer +must be initialized before calling the encoder. + +The expanded data can be inverted on the fly by providing a non-zero +inversion mask. The expanded data is XOR'ed with the mask. This is used +e.g. for FLASH ECC, where the all 0xFF is inverted to an all 0x00. The +Reed-Solomon code for all 0x00 is all 0x00. The code is inverted before +storing to FLASH so it is 0xFF too. This prevents that reading from an +erased FLASH results in ECC errors. + +The databytes are expanded to the given symbol size on the fly. There is +no support for encoding continuous bitstreams with a symbol size != 8 at +the moment. If it is necessary it should be not a big deal to implement +such functionality. + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6]; + /* Initialize the parity buffer */ + memset(par, 0, sizeof(par)); + /* Encode 512 byte in data8. Store parity in buffer par */ + encode_rs8 (rs_decoder, data8, 512, par, 0); + + +Decoding +-------- + +The decoder calculates the syndrome over the given data length and the +received parity symbols and corrects errors in the data. + +If a syndrome is available from a hardware decoder then the syndrome +calculation is skipped. + +The correction of the data buffer can be suppressed by providing a +correction pattern buffer and an error location buffer to the decoder. +The decoder stores the calculated error location and the correction +bitmask in the given buffers. This is useful for hardware decoders which +use a weird bit ordering scheme. + +The databytes are expanded to the given symbol size on the fly. There is +no support for decoding continuous bitstreams with a symbolsize != 8 at +the moment. If it is necessary it should be not a big deal to implement +such functionality. + +Decoding with syndrome calculation, direct data correction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6]; + uint8_t data[512]; + int numerr; + /* Receive data */ + ..... + /* Receive parity */ + ..... + /* Decode 512 byte in data8.*/ + numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL); + + +Decoding with syndrome given by hardware decoder, direct data correction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6], syn[6]; + uint8_t data[512]; + int numerr; + /* Receive data */ + ..... + /* Receive parity */ + ..... + /* Get syndrome from hardware decoder */ + ..... + /* Decode 512 byte in data8.*/ + numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL); + + +Decoding with syndrome given by hardware decoder, no direct data correction. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Note: It's not necessary to give data and received parity to the +decoder. + +:: + + /* Parity buffer. Size = number of roots */ + uint16_t par[6], syn[6], corr[8]; + uint8_t data[512]; + int numerr, errpos[8]; + /* Receive data */ + ..... + /* Receive parity */ + ..... + /* Get syndrome from hardware decoder */ + ..... + /* Decode 512 byte in data8.*/ + numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr); + for (i = 0; i < numerr; i++) { + do_error_correction_in_your_buffer(errpos[i], corr[i]); + } + + +Cleanup +------- + +The function free_rs frees the allocated resources, if the caller is +the last user of the decoder. + +:: + + /* Release resources */ + free_rs(rs_decoder); + + +Structures +========== + +This chapter contains the autogenerated documentation of the structures +which are used in the Reed-Solomon Library and are relevant for a +developer. + +.. kernel-doc:: include/linux/rslib.h + :internal: + +Public Functions Provided +========================= + +This chapter contains the autogenerated documentation of the +Reed-Solomon functions which are exported. + +.. kernel-doc:: lib/reed_solomon/reed_solomon.c + :export: + +Credits +======= + +The library code for encoding and decoding was written by Phil Karn. + +:: + + Copyright 2002, Phil Karn, KA9Q + May be used under the terms of the GNU General Public License (GPL) + + +The wrapper functions and interfaces are written by Thomas Gleixner. + +Many users have provided bugfixes, improvements and helping hands for +testing. Thanks a lot. + +The following people have contributed to this document: + +Thomas Gleixner\ tglx@linutronix.de diff --git a/Documentation/crypto/conf.py b/Documentation/crypto/conf.py new file mode 100644 index 000000000000..4335d251ddf3 --- /dev/null +++ b/Documentation/crypto/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = 'Linux Kernel Crypto API' + +tags.add("subproject") + +latex_documents = [ + ('index', 'crypto-api.tex', 'Linux Kernel Crypto API manual', + 'The kernel development community', 'manual'), +] diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index 07d881147ef3..4ac991dbddb7 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -23,6 +23,7 @@ whole; patches welcome! kmemleak kmemcheck gdb-kernel-debugging + kgdb .. only:: subproject and html diff --git a/Documentation/dev-tools/kgdb.rst b/Documentation/dev-tools/kgdb.rst new file mode 100644 index 000000000000..75273203a35a --- /dev/null +++ b/Documentation/dev-tools/kgdb.rst @@ -0,0 +1,907 @@ +================================================= +Using kgdb, kdb and the kernel debugger internals +================================================= + +:Author: Jason Wessel + +Introduction +============ + +The kernel has two different debugger front ends (kdb and kgdb) which +interface to the debug core. It is possible to use either of the +debugger front ends and dynamically transition between them if you +configure the kernel properly at compile and runtime. + +Kdb is simplistic shell-style interface which you can use on a system +console with a keyboard or serial console. You can use it to inspect +memory, registers, process lists, dmesg, and even set breakpoints to +stop in a certain location. Kdb is not a source level debugger, although +you can set breakpoints and execute some basic kernel run control. Kdb +is mainly aimed at doing some analysis to aid in development or +diagnosing kernel problems. You can access some symbols by name in +kernel built-ins or in kernel modules if the code was built with +``CONFIG_KALLSYMS``. + +Kgdb is intended to be used as a source level debugger for the Linux +kernel. It is used along with gdb to debug a Linux kernel. The +expectation is that gdb can be used to "break in" to the kernel to +inspect memory, variables and look through call stack information +similar to the way an application developer would use gdb to debug an +application. It is possible to place breakpoints in kernel code and +perform some limited execution stepping. + +Two machines are required for using kgdb. One of these machines is a +development machine and the other is the target machine. The kernel to +be debugged runs on the target machine. The development machine runs an +instance of gdb against the vmlinux file which contains the symbols (not +a boot image such as bzImage, zImage, uImage...). In gdb the developer +specifies the connection parameters and connects to kgdb. The type of +connection a developer makes with gdb depends on the availability of +kgdb I/O modules compiled as built-ins or loadable kernel modules in the +test machine's kernel. + +Compiling a kernel +================== + +- In order to enable compilation of kdb, you must first enable kgdb. + +- The kgdb test compile options are described in the kgdb test suite + chapter. + +Kernel config options for kgdb +------------------------------ + +To enable ``CONFIG_KGDB`` you should look under +:menuselection:`Kernel hacking --> Kernel debugging` and select +:menuselection:`KGDB: kernel debugger`. + +While it is not a hard requirement that you have symbols in your vmlinux +file, gdb tends not to be very useful without the symbolic data, so you +will want to turn on ``CONFIG_DEBUG_INFO`` which is called +:menuselection:`Compile the kernel with debug info` in the config menu. + +It is advised, but not required, that you turn on the +``CONFIG_FRAME_POINTER`` kernel option which is called :menuselection:`Compile +the kernel with frame pointers` in the config menu. This option inserts code +to into the compiled executable which saves the frame information in +registers or on the stack at different points which allows a debugger +such as gdb to more accurately construct stack back traces while +debugging the kernel. + +If the architecture that you are using supports the kernel option +``CONFIG_STRICT_KERNEL_RWX``, you should consider turning it off. This +option will prevent the use of software breakpoints because it marks +certain regions of the kernel's memory space as read-only. If kgdb +supports it for the architecture you are using, you can use hardware +breakpoints if you desire to run with the ``CONFIG_STRICT_KERNEL_RWX`` +option turned on, else you need to turn off this option. + +Next you should choose one of more I/O drivers to interconnect debugging +host and debugged target. Early boot debugging requires a KGDB I/O +driver that supports early debugging and the driver must be built into +the kernel directly. Kgdb I/O driver configuration takes place via +kernel or module parameters which you can learn more about in the in the +section that describes the parameter kgdboc. + +Here is an example set of ``.config`` symbols to enable or disable for kgdb:: + + # CONFIG_STRICT_KERNEL_RWX is not set + CONFIG_FRAME_POINTER=y + CONFIG_KGDB=y + CONFIG_KGDB_SERIAL_CONSOLE=y + +Kernel config options for kdb +----------------------------- + +Kdb is quite a bit more complex than the simple gdbstub sitting on top +of the kernel's debug core. Kdb must implement a shell, and also adds +some helper functions in other parts of the kernel, responsible for +printing out interesting data such as what you would see if you ran +``lsmod``, or ``ps``. In order to build kdb into the kernel you follow the +same steps as you would for kgdb. + +The main config option for kdb is ``CONFIG_KGDB_KDB`` which is called +:menuselection:`KGDB_KDB: include kdb frontend for kgdb` in the config menu. +In theory you would have already also selected an I/O driver such as the +``CONFIG_KGDB_SERIAL_CONSOLE`` interface if you plan on using kdb on a +serial port, when you were configuring kgdb. + +If you want to use a PS/2-style keyboard with kdb, you would select +``CONFIG_KDB_KEYBOARD`` which is called :menuselection:`KGDB_KDB: keyboard as +input device` in the config menu. The ``CONFIG_KDB_KEYBOARD`` option is not +used for anything in the gdb interface to kgdb. The ``CONFIG_KDB_KEYBOARD`` +option only works with kdb. + +Here is an example set of ``.config`` symbols to enable/disable kdb:: + + # CONFIG_STRICT_KERNEL_RWX is not set + CONFIG_FRAME_POINTER=y + CONFIG_KGDB=y + CONFIG_KGDB_SERIAL_CONSOLE=y + CONFIG_KGDB_KDB=y + CONFIG_KDB_KEYBOARD=y + +Kernel Debugger Boot Arguments +============================== + +This section describes the various runtime kernel parameters that affect +the configuration of the kernel debugger. The following chapter covers +using kdb and kgdb as well as providing some examples of the +configuration parameters. + +Kernel parameter: kgdboc +------------------------ + +The kgdboc driver was originally an abbreviation meant to stand for +"kgdb over console". Today it is the primary mechanism to configure how +to communicate from gdb to kgdb as well as the devices you want to use +to interact with the kdb shell. + +For kgdb/gdb, kgdboc is designed to work with a single serial port. It +is intended to cover the circumstance where you want to use a serial +console as your primary console as well as using it to perform kernel +debugging. It is also possible to use kgdb on a serial port which is not +designated as a system console. Kgdboc may be configured as a kernel +built-in or a kernel loadable module. You can only make use of +``kgdbwait`` and early debugging if you build kgdboc into the kernel as +a built-in. + +Optionally you can elect to activate kms (Kernel Mode Setting) +integration. When you use kms with kgdboc and you have a video driver +that has atomic mode setting hooks, it is possible to enter the debugger +on the graphics console. When the kernel execution is resumed, the +previous graphics mode will be restored. This integration can serve as a +useful tool to aid in diagnosing crashes or doing analysis of memory +with kdb while allowing the full graphics console applications to run. + +kgdboc arguments +~~~~~~~~~~~~~~~~ + +Usage:: + + kgdboc=[kms][[,]kbd][[,]serial_device][,baud] + +The order listed above must be observed if you use any of the optional +configurations together. + +Abbreviations: + +- kms = Kernel Mode Setting + +- kbd = Keyboard + +You can configure kgdboc to use the keyboard, and/or a serial device +depending on if you are using kdb and/or kgdb, in one of the following +scenarios. The order listed above must be observed if you use any of the +optional configurations together. Using kms + only gdb is generally not +a useful combination. + +Using loadable module or built-in +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. As a kernel built-in: + + Use the kernel boot argument:: + + kgdboc=,[baud] + +2. As a kernel loadable module: + + Use the command:: + + modprobe kgdboc kgdboc=,[baud] + + Here are two examples of how you might format the kgdboc string. The + first is for an x86 target using the first serial port. The second + example is for the ARM Versatile AB using the second serial port. + + 1. ``kgdboc=ttyS0,115200`` + + 2. ``kgdboc=ttyAMA1,115200`` + +Configure kgdboc at runtime with sysfs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +At run time you can enable or disable kgdboc by echoing a parameters +into the sysfs. Here are two examples: + +1. Enable kgdboc on ttyS0:: + + echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc + +2. Disable kgdboc:: + + echo "" > /sys/module/kgdboc/parameters/kgdboc + +.. note:: + + You do not need to specify the baud if you are configuring the + console on tty which is already configured or open. + +More examples +^^^^^^^^^^^^^ + +You can configure kgdboc to use the keyboard, and/or a serial device +depending on if you are using kdb and/or kgdb, in one of the following +scenarios. + +1. kdb and kgdb over only a serial port:: + + kgdboc=[,baud] + + Example:: + + kgdboc=ttyS0,115200 + +2. kdb and kgdb with keyboard and a serial port:: + + kgdboc=kbd,[,baud] + + Example:: + + kgdboc=kbd,ttyS0,115200 + +3. kdb with a keyboard:: + + kgdboc=kbd + +4. kdb with kernel mode setting:: + + kgdboc=kms,kbd + +5. kdb with kernel mode setting and kgdb over a serial port:: + + kgdboc=kms,kbd,ttyS0,115200 + +.. note:: + + Kgdboc does not support interrupting the target via the gdb remote + protocol. You must manually send a :kbd:`SysRq-G` unless you have a proxy + that splits console output to a terminal program. A console proxy has a + separate TCP port for the debugger and a separate TCP port for the + "human" console. The proxy can take care of sending the :kbd:`SysRq-G` + for you. + +When using kgdboc with no debugger proxy, you can end up connecting the +debugger at one of two entry points. If an exception occurs after you +have loaded kgdboc, a message should print on the console stating it is +waiting for the debugger. In this case you disconnect your terminal +program and then connect the debugger in its place. If you want to +interrupt the target system and forcibly enter a debug session you have +to issue a :kbd:`Sysrq` sequence and then type the letter :kbd:`g`. Then you +disconnect the terminal session and connect gdb. Your options if you +don't like this are to hack gdb to send the :kbd:`SysRq-G` for you as well as +on the initial connect, or to use a debugger proxy that allows an +unmodified gdb to do the debugging. + +Kernel parameter: ``kgdbwait`` +------------------------------ + +The Kernel command line option ``kgdbwait`` makes kgdb wait for a +debugger connection during booting of a kernel. You can only use this +option if you compiled a kgdb I/O driver into the kernel and you +specified the I/O driver configuration as a kernel command line option. +The kgdbwait parameter should always follow the configuration parameter +for the kgdb I/O driver in the kernel command line else the I/O driver +will not be configured prior to asking the kernel to use it to wait. + +The kernel will stop and wait as early as the I/O driver and +architecture allows when you use this option. If you build the kgdb I/O +driver as a loadable kernel module kgdbwait will not do anything. + +Kernel parameter: ``kgdbcon`` +----------------------------- + +The ``kgdbcon`` feature allows you to see :c:func:`printk` messages inside gdb +while gdb is connected to the kernel. Kdb does not make use of the kgdbcon +feature. + +Kgdb supports using the gdb serial protocol to send console messages to +the debugger when the debugger is connected and running. There are two +ways to activate this feature. + +1. Activate with the kernel command line option:: + + kgdbcon + +2. Use sysfs before configuring an I/O driver:: + + echo 1 > /sys/module/kgdb/parameters/kgdb_use_con + +.. note:: + + If you do this after you configure the kgdb I/O driver, the + setting will not take effect until the next point the I/O is + reconfigured. + +.. important:: + + You cannot use kgdboc + kgdbcon on a tty that is an + active system console. An example of incorrect usage is:: + + console=ttyS0,115200 kgdboc=ttyS0 kgdbcon + +It is possible to use this option with kgdboc on a tty that is not a +system console. + +Run time parameter: ``kgdbreboot`` +---------------------------------- + +The kgdbreboot feature allows you to change how the debugger deals with +the reboot notification. You have 3 choices for the behavior. The +default behavior is always set to 0. + +.. tabularcolumns:: |p{0.4cm}|p{11.5cm}|p{5.6cm}| + +.. flat-table:: + :widths: 1 10 8 + + * - 1 + - ``echo -1 > /sys/module/debug_core/parameters/kgdbreboot`` + - Ignore the reboot notification entirely. + + * - 2 + - ``echo 0 > /sys/module/debug_core/parameters/kgdbreboot`` + - Send the detach message to any attached debugger client. + + * - 3 + - ``echo 1 > /sys/module/debug_core/parameters/kgdbreboot`` + - Enter the debugger on reboot notify. + +Using kdb +========= + +Quick start for kdb on a serial port +------------------------------------ + +This is a quick example of how to use kdb. + +1. Configure kgdboc at boot using kernel parameters:: + + console=ttyS0,115200 kgdboc=ttyS0,115200 + + OR + + Configure kgdboc after the kernel has booted; assuming you are using + a serial port console:: + + echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc + +2. Enter the kernel debugger manually or by waiting for an oops or + fault. There are several ways you can enter the kernel debugger + manually; all involve using the :kbd:`SysRq-G`, which means you must have + enabled ``CONFIG_MAGIC_SysRq=y`` in your kernel config. + + - When logged in as root or with a super user session you can run:: + + echo g > /proc/sysrq-trigger + + - Example using minicom 2.2 + + Press: :kbd:`CTRL-A` :kbd:`f` :kbd:`g` + + - When you have telneted to a terminal server that supports sending + a remote break + + Press: :kbd:`CTRL-]` + + Type in: ``send break`` + + Press: :kbd:`Enter` :kbd:`g` + +3. From the kdb prompt you can run the ``help`` command to see a complete + list of the commands that are available. + + Some useful commands in kdb include: + + =========== ================================================================= + ``lsmod`` Shows where kernel modules are loaded + ``ps`` Displays only the active processes + ``ps A`` Shows all the processes + ``summary`` Shows kernel version info and memory usage + ``bt`` Get a backtrace of the current process using :c:func:`dump_stack` + ``dmesg`` View the kernel syslog buffer + ``go`` Continue the system + =========== ================================================================= + +4. When you are done using kdb you need to consider rebooting the system + or using the ``go`` command to resuming normal kernel execution. If you + have paused the kernel for a lengthy period of time, applications + that rely on timely networking or anything to do with real wall clock + time could be adversely affected, so you should take this into + consideration when using the kernel debugger. + +Quick start for kdb using a keyboard connected console +------------------------------------------------------ + +This is a quick example of how to use kdb with a keyboard. + +1. Configure kgdboc at boot using kernel parameters:: + + kgdboc=kbd + + OR + + Configure kgdboc after the kernel has booted:: + + echo kbd > /sys/module/kgdboc/parameters/kgdboc + +2. Enter the kernel debugger manually or by waiting for an oops or + fault. There are several ways you can enter the kernel debugger + manually; all involve using the :kbd:`SysRq-G`, which means you must have + enabled ``CONFIG_MAGIC_SysRq=y`` in your kernel config. + + - When logged in as root or with a super user session you can run:: + + echo g > /proc/sysrq-trigger + + - Example using a laptop keyboard: + + Press and hold down: :kbd:`Alt` + + Press and hold down: :kbd:`Fn` + + Press and release the key with the label: :kbd:`SysRq` + + Release: :kbd:`Fn` + + Press and release: :kbd:`g` + + Release: :kbd:`Alt` + + - Example using a PS/2 101-key keyboard + + Press and hold down: :kbd:`Alt` + + Press and release the key with the label: :kbd:`SysRq` + + Press and release: :kbd:`g` + + Release: :kbd:`Alt` + +3. Now type in a kdb command such as ``help``, ``dmesg``, ``bt`` or ``go`` to + continue kernel execution. + +Using kgdb / gdb +================ + +In order to use kgdb you must activate it by passing configuration +information to one of the kgdb I/O drivers. If you do not pass any +configuration information kgdb will not do anything at all. Kgdb will +only actively hook up to the kernel trap hooks if a kgdb I/O driver is +loaded and configured. If you unconfigure a kgdb I/O driver, kgdb will +unregister all the kernel hook points. + +All kgdb I/O drivers can be reconfigured at run time, if +``CONFIG_SYSFS`` and ``CONFIG_MODULES`` are enabled, by echo'ing a new +config string to ``/sys/module//parameter/