diff options
author | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
---|---|---|
committer | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
commit | fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch) | |
tree | 22962a4387943edc841c72a4e636a068c66d58fd /Documentation/DocBook | |
download | ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz |
Initial import of modified Linux 2.6.28 tree
Original upstream URL:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'Documentation/DocBook')
30 files changed, 15492 insertions, 0 deletions
diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore new file mode 100644 index 0000000..c102c02 --- /dev/null +++ b/Documentation/DocBook/.gitignore @@ -0,0 +1,6 @@ +*.xml +*.ps +*.pdf +*.html +*.9.gz +*.9 diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile new file mode 100644 index 0000000..9b1f6ca --- /dev/null +++ b/Documentation/DocBook/Makefile @@ -0,0 +1,240 @@ +### +# This makefile is used to generate the kernel documentation, +# primarily based on in-line comments in various source files. +# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how +# to document the SRC - and how to read it. +# To add a new book the only step required is to add the book to the +# list of DOCBOOKS. + +DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml \ + kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ + procfs-guide.xml writing_usb_driver.xml networking.xml \ + kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ + genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ + mac80211.xml debugobjects.xml sh.xml + +### +# The build process is as follows (targets): +# (xmldocs) [by docproc] +# file.tmpl --> file.xml +--> file.ps (psdocs) [by db2ps or xmlto] +# +--> file.pdf (pdfdocs) [by db2pdf or xmlto] +# +--> DIR=file (htmldocs) [by xmlto] +# +--> man/ (mandocs) [by xmlto] + + +# for PDF and PS output you can choose between xmlto and docbook-utils tools +PDF_METHOD = $(prefer-db2x) +PS_METHOD = $(prefer-db2x) + + +### +# The targets that may be used. +PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs + +BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) +xmldocs: $(BOOKS) +sgmldocs: xmldocs + +PS := $(patsubst %.xml, %.ps, $(BOOKS)) +psdocs: $(PS) + +PDF := $(patsubst %.xml, %.pdf, $(BOOKS)) +pdfdocs: $(PDF) + +HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS))) +htmldocs: $(HTML) + $(call build_main_index) + +MAN := $(patsubst %.xml, %.9, $(BOOKS)) +mandocs: $(MAN) + +installmandocs: mandocs + mkdir -p /usr/local/man/man9/ + install Documentation/DocBook/man/*.9.gz /usr/local/man/man9/ + +### +#External programs used +KERNELDOC = $(srctree)/scripts/kernel-doc +DOCPROC = $(objtree)/scripts/basic/docproc + +XMLTOFLAGS = -m $(srctree)/Documentation/DocBook/stylesheet.xsl +#XMLTOFLAGS += --skip-validation + +### +# DOCPROC is used for two purposes: +# 1) To generate a dependency list for a .tmpl file +# 2) To preprocess a .tmpl file and call kernel-doc with +# appropriate parameters. +# The following rules are used to generate the .xml documentation +# required to generate the final targets. (ps, pdf, html). +quiet_cmd_docproc = DOCPROC $@ + cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@ +define rule_docproc + set -e; \ + $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \ + $(cmd_$(1)); \ + ( \ + echo 'cmd_$@ := $(cmd_$(1))'; \ + echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \ + ) > $(dir $@).$(notdir $@).cmd +endef + +%.xml: %.tmpl FORCE + $(call if_changed_rule,docproc) + +### +#Read in all saved dependency files +cmd_files := $(wildcard $(foreach f,$(BOOKS),$(dir $(f)).$(notdir $(f)).cmd)) + +ifneq ($(cmd_files),) + include $(cmd_files) +endif + +### +# Changes in kernel-doc force a rebuild of all documentation +$(BOOKS): $(KERNELDOC) + +### +# procfs guide uses a .c file as example code. +# This requires an explicit dependency +C-procfs-example = procfs_example.xml +C-procfs-example2 = $(addprefix $(obj)/,$(C-procfs-example)) +$(obj)/procfs-guide.xml: $(C-procfs-example2) + +# List of programs to build +##oops, this is a kernel module::hostprogs-y := procfs_example +obj-m += procfs_example.o + +# Tell kbuild to always build the programs +always := $(hostprogs-y) + +notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \ + exit 1 +db2xtemplate = db2TYPE -o $(dir $@) $< +xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $< + +# determine which methods are available +ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found) + use-db2x = db2x + prefer-db2x = db2x +else + use-db2x = notfound + prefer-db2x = $(use-xmlto) +endif +ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found) + use-xmlto = xmlto + prefer-xmlto = xmlto +else + use-xmlto = notfound + prefer-xmlto = $(use-db2x) +endif + +# the commands, generated from the chosen template +quiet_cmd_db2ps = PS $@ + cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template)) +%.ps : %.xml + $(call cmd,db2ps) + +quiet_cmd_db2pdf = PDF $@ + cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template)) +%.pdf : %.xml + $(call cmd,db2pdf) + + +main_idx = Documentation/DocBook/index.html +build_main_index = rm -rf $(main_idx) && \ + echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \ + echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \ + cat $(HTML) >> $(main_idx) + +quiet_cmd_db2html = HTML $@ + cmd_db2html = xmlto xhtml $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \ + echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \ + $(patsubst %.html,%,$(notdir $@))</a><p>' > $@ + +%.html: %.xml + @(which xmlto > /dev/null 2>&1) || \ + (echo "*** You need to install xmlto ***"; \ + exit 1) + @rm -rf $@ $(patsubst %.html,%,$@) + $(call cmd,db2html) + @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \ + cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi + +quiet_cmd_db2man = MAN $@ + cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man $< ; gzip -f $(obj)/man/*.9; fi +%.9 : %.xml + @(which xmlto > /dev/null 2>&1) || \ + (echo "*** You need to install xmlto ***"; \ + exit 1) + $(Q)mkdir -p $(obj)/man + $(call cmd,db2man) + @touch $@ + +### +# Rules to generate postscripts and PNG images from .fig format files +quiet_cmd_fig2eps = FIG2EPS $@ + cmd_fig2eps = fig2dev -Leps $< $@ + +%.eps: %.fig + @(which fig2dev > /dev/null 2>&1) || \ + (echo "*** You need to install transfig ***"; \ + exit 1) + $(call cmd,fig2eps) + +quiet_cmd_fig2png = FIG2PNG $@ + cmd_fig2png = fig2dev -Lpng $< $@ + +%.png: %.fig + @(which fig2dev > /dev/null 2>&1) || \ + (echo "*** You need to install transfig ***"; \ + exit 1) + $(call cmd,fig2png) + +### +# Rule to convert a .c file to inline XML documentation + gen_xml = : + quiet_gen_xml = echo ' GEN $@' +silent_gen_xml = : +%.xml: %.c + @$($(quiet)gen_xml) + @( \ + echo "<programlisting>"; \ + expand --tabs=8 < $< | \ + sed -e "s/&/\\&/g" \ + -e "s/</\\</g" \ + -e "s/>/\\>/g"; \ + echo "</programlisting>") > $@ + +### +# Help targets as used by the top-level makefile +dochelp: + @echo ' Linux kernel internal documentation in different formats:' + @echo ' htmldocs - HTML' + @echo ' installmandocs - install man pages generated by mandocs' + @echo ' mandocs - man pages' + @echo ' pdfdocs - PDF' + @echo ' psdocs - Postscript' + @echo ' xmldocs - XML DocBook' + +### +# Temporary files left by various tools +clean-files := $(DOCBOOKS) \ + $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \ + $(patsubst %.xml, %.aux, $(DOCBOOKS)) \ + $(patsubst %.xml, %.tex, $(DOCBOOKS)) \ + $(patsubst %.xml, %.log, $(DOCBOOKS)) \ + $(patsubst %.xml, %.out, $(DOCBOOKS)) \ + $(patsubst %.xml, %.ps, $(DOCBOOKS)) \ + $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \ + $(patsubst %.xml, %.html, $(DOCBOOKS)) \ + $(patsubst %.xml, %.9, $(DOCBOOKS)) \ + $(C-procfs-example) + +clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man + +# Declare the contents of the .PHONY variable as phony. We keep that +# information in a variable se we can use it in if_changed and friends. + +.PHONY: $(PHONY) diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl new file mode 100644 index 0000000..7f5f218 --- /dev/null +++ b/Documentation/DocBook/debugobjects.tmpl @@ -0,0 +1,391 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="debug-objects-guide"> + <bookinfo> + <title>Debug objects life time</title> + + <authorgroup> + <author> + <firstname>Thomas</firstname> + <surname>Gleixner</surname> + <affiliation> + <address> + <email>tglx@linutronix.de</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2008</year> + <holder>Thomas Gleixner</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + debugobjects is a generic infrastructure to track the life time + of kernel objects and validate the operations on those. + </para> + <para> + debugobjects is useful to check for the following error patterns: + <itemizedlist> + <listitem><para>Activation of uninitialized objects</para></listitem> + <listitem><para>Initialization of active objects</para></listitem> + <listitem><para>Usage of freed/destroyed objects</para></listitem> + </itemizedlist> + </para> + <para> + debugobjects is not changing the data structure of the real + object so it can be compiled in with a minimal runtime impact + and enabled on demand with a kernel command line option. + </para> + </chapter> + + <chapter id="howto"> + <title>Howto use debugobjects</title> + <para> + A kernel subsystem needs to provide a data structure which + describes the object type and add calls into the debug code at + appropriate places. The data structure to describe the object + type needs at minimum the name of the object type. Optional + functions can and should be provided to fixup detected problems + so the kernel can continue to work and the debug information can + be retrieved from a live system instead of hard core debugging + with serial consoles and stack trace transcripts from the + monitor. + </para> + <para> + The debug calls provided by debugobjects are: + <itemizedlist> + <listitem><para>debug_object_init</para></listitem> + <listitem><para>debug_object_init_on_stack</para></listitem> + <listitem><para>debug_object_activate</para></listitem> + <listitem><para>debug_object_deactivate</para></listitem> + <listitem><para>debug_object_destroy</para></listitem> + <listitem><para>debug_object_free</para></listitem> + </itemizedlist> + Each of these functions takes the address of the real object and + a pointer to the object type specific debug description + structure. + </para> + <para> + Each detected error is reported in the statistics and a limited + number of errors are printk'ed including a full stack trace. + </para> + <para> + The statistics are available via debugfs/debug_objects/stats. + They provide information about the number of warnings and the + number of successful fixups along with information about the + usage of the internal tracking objects and the state of the + internal tracking objects pool. + </para> + </chapter> + <chapter id="debugfunctions"> + <title>Debug functions</title> + <sect1 id="prototypes"> + <title>Debug object function reference</title> +!Elib/debugobjects.c + </sect1> + <sect1 id="debug_object_init"> + <title>debug_object_init</title> + <para> + This function is called whenever the initialization function + of a real object is called. + </para> + <para> + When the real object is already tracked by debugobjects it is + checked, whether the object can be initialized. Initializing + is not allowed for active and destroyed objects. When + debugobjects detects an error, then it calls the fixup_init + function of the object type description structure if provided + by the caller. The fixup function can correct the problem + before the real initialization of the object happens. E.g. it + can deactivate an active object in order to prevent damage to + the subsystem. + </para> + <para> + When the real object is not yet tracked by debugobjects, + debugobjects allocates a tracker object for the real object + and sets the tracker object state to ODEBUG_STATE_INIT. It + verifies that the object is not on the callers stack. If it is + on the callers stack then a limited number of warnings + including a full stack trace is printk'ed. The calling code + must use debug_object_init_on_stack() and remove the object + before leaving the function which allocated it. See next + section. + </para> + </sect1> + + <sect1 id="debug_object_init_on_stack"> + <title>debug_object_init_on_stack</title> + <para> + This function is called whenever the initialization function + of a real object which resides on the stack is called. + </para> + <para> + When the real object is already tracked by debugobjects it is + checked, whether the object can be initialized. Initializing + is not allowed for active and destroyed objects. When + debugobjects detects an error, then it calls the fixup_init + function of the object type description structure if provided + by the caller. The fixup function can correct the problem + before the real initialization of the object happens. E.g. it + can deactivate an active object in order to prevent damage to + the subsystem. + </para> + <para> + When the real object is not yet tracked by debugobjects + debugobjects allocates a tracker object for the real object + and sets the tracker object state to ODEBUG_STATE_INIT. It + verifies that the object is on the callers stack. + </para> + <para> + An object which is on the stack must be removed from the + tracker by calling debug_object_free() before the function + which allocates the object returns. Otherwise we keep track of + stale objects. + </para> + </sect1> + + <sect1 id="debug_object_activate"> + <title>debug_object_activate</title> + <para> + This function is called whenever the activation function of a + real object is called. + </para> + <para> + When the real object is already tracked by debugobjects it is + checked, whether the object can be activated. Activating is + not allowed for active and destroyed objects. When + debugobjects detects an error, then it calls the + fixup_activate function of the object type description + structure if provided by the caller. The fixup function can + correct the problem before the real activation of the object + happens. E.g. it can deactivate an active object in order to + prevent damage to the subsystem. + </para> + <para> + When the real object is not yet tracked by debugobjects then + the fixup_activate function is called if available. This is + necessary to allow the legitimate activation of statically + allocated and initialized objects. The fixup function checks + whether the object is valid and calls the debug_objects_init() + function to initialize the tracking of this object. + </para> + <para> + When the activation is legitimate, then the state of the + associated tracker object is set to ODEBUG_STATE_ACTIVE. + </para> + </sect1> + + <sect1 id="debug_object_deactivate"> + <title>debug_object_deactivate</title> + <para> + This function is called whenever the deactivation function of + a real object is called. + </para> + <para> + When the real object is tracked by debugobjects it is checked, + whether the object can be deactivated. Deactivating is not + allowed for untracked or destroyed objects. + </para> + <para> + When the deactivation is legitimate, then the state of the + associated tracker object is set to ODEBUG_STATE_INACTIVE. + </para> + </sect1> + + <sect1 id="debug_object_destroy"> + <title>debug_object_destroy</title> + <para> + This function is called to mark an object destroyed. This is + useful to prevent the usage of invalid objects, which are + still available in memory: either statically allocated objects + or objects which are freed later. + </para> + <para> + When the real object is tracked by debugobjects it is checked, + whether the object can be destroyed. Destruction is not + allowed for active and destroyed objects. When debugobjects + detects an error, then it calls the fixup_destroy function of + the object type description structure if provided by the + caller. The fixup function can correct the problem before the + real destruction of the object happens. E.g. it can deactivate + an active object in order to prevent damage to the subsystem. + </para> + <para> + When the destruction is legitimate, then the state of the + associated tracker object is set to ODEBUG_STATE_DESTROYED. + </para> + </sect1> + + <sect1 id="debug_object_free"> + <title>debug_object_free</title> + <para> + This function is called before an object is freed. + </para> + <para> + When the real object is tracked by debugobjects it is checked, + whether the object can be freed. Free is not allowed for + active objects. When debugobjects detects an error, then it + calls the fixup_free function of the object type description + structure if provided by the caller. The fixup function can + correct the problem before the real free of the object + happens. E.g. it can deactivate an active object in order to + prevent damage to the subsystem. + </para> + <para> + Note that debug_object_free removes the object from the + tracker. Later usage of the object is detected by the other + debug checks. + </para> + </sect1> + </chapter> + <chapter id="fixupfunctions"> + <title>Fixup functions</title> + <sect1 id="debug_obj_descr"> + <title>Debug object type description structure</title> +!Iinclude/linux/debugobjects.h + </sect1> + <sect1 id="fixup_init"> + <title>fixup_init</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_init is detected. The function takes the + address of the object and the state which is currently + recorded in the tracker. + </para> + <para> + Called from debug_object_init when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + <para> + Note, that the function needs to call the debug_object_init() + function again, after the damage has been repaired in order to + keep the state consistent. + </para> + </sect1> + + <sect1 id="fixup_activate"> + <title>fixup_activate</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_activate is detected. + </para> + <para> + Called from debug_object_activate when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_NOTAVAILABLE</para></listitem> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + <para> + Note that the function needs to call the debug_object_activate() + function again after the damage has been repaired in order to + keep the state consistent. + </para> + <para> + The activation of statically initialized objects is a special + case. When debug_object_activate() has no tracked object for + this object address then fixup_activate() is called with + object state ODEBUG_STATE_NOTAVAILABLE. The fixup function + needs to check whether this is a legitimate case of a + statically initialized object or not. In case it is it calls + debug_object_init() and debug_object_activate() to make the + object known to the tracker and marked active. In this case + the function should return 0 because this is not a real fixup. + </para> + </sect1> + + <sect1 id="fixup_destroy"> + <title>fixup_destroy</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_destroy is detected. + </para> + <para> + Called from debug_object_destroy when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + </sect1> + <sect1 id="fixup_free"> + <title>fixup_free</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_free is detected. Further it can be called + from the debug checks in kfree/vfree, when an active object is + detected from the debug_check_no_obj_freed() sanity checks. + </para> + <para> + Called from debug_object_free() or debug_check_no_obj_freed() + when the object state is: + <itemizedlist> + <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem> + </itemizedlist> + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + </sect1> + </chapter> + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + None (knock on wood). + </para> + </chapter> +</book> diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl new file mode 100644 index 0000000..3ed8812 --- /dev/null +++ b/Documentation/DocBook/deviceiobook.tmpl @@ -0,0 +1,323 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="DoingIO"> + <bookinfo> + <title>Bus-Independent Device Accesses</title> + + <authorgroup> + <author> + <firstname>Matthew</firstname> + <surname>Wilcox</surname> + <affiliation> + <address> + <email>matthew@wil.cx</email> + </address> + </affiliation> + </author> + </authorgroup> + + <authorgroup> + <author> + <firstname>Alan</firstname> + <surname>Cox</surname> + <affiliation> + <address> + <email>alan@lxorguk.ukuu.org.uk</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2001</year> + <holder>Matthew Wilcox</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + Linux provides an API which abstracts performing IO across all busses + and devices, allowing device drivers to be written independently of + bus type. + </para> + </chapter> + + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + None. + </para> + </chapter> + + <chapter id="mmio"> + <title>Memory Mapped IO</title> + <sect1 id="getting_access_to_the_device"> + <title>Getting Access to the Device</title> + <para> + The most widely supported form of IO is memory mapped IO. + That is, a part of the CPU's address space is interpreted + not as accesses to memory, but as accesses to a device. Some + architectures define devices to be at a fixed address, but most + have some method of discovering devices. The PCI bus walk is a + good example of such a scheme. This document does not cover how + to receive such an address, but assumes you are starting with one. + Physical addresses are of type unsigned long. + </para> + + <para> + This address should not be used directly. Instead, to get an + address suitable for passing to the accessor functions described + below, you should call <function>ioremap</function>. + An address suitable for accessing the device will be returned to you. + </para> + + <para> + After you've finished using the device (say, in your module's + exit routine), call <function>iounmap</function> in order to return + the address space to the kernel. Most architectures allocate new + address space each time you call <function>ioremap</function>, and + they can run out unless you call <function>iounmap</function>. + </para> + </sect1> + + <sect1 id="accessing_the_device"> + <title>Accessing the device</title> + <para> + The part of the interface most used by drivers is reading and + writing memory-mapped registers on the device. Linux provides + interfaces to read and write 8-bit, 16-bit, 32-bit and 64-bit + quantities. Due to a historical accident, these are named byte, + word, long and quad accesses. Both read and write accesses are + supported; there is no prefetch support at this time. + </para> + + <para> + The functions are named <function>readb</function>, + <function>readw</function>, <function>readl</function>, + <function>readq</function>, <function>readb_relaxed</function>, + <function>readw_relaxed</function>, <function>readl_relaxed</function>, + <function>readq_relaxed</function>, <function>writeb</function>, + <function>writew</function>, <function>writel</function> and + <function>writeq</function>. + </para> + + <para> + Some devices (such as framebuffers) would like to use larger + transfers than 8 bytes at a time. For these devices, the + <function>memcpy_toio</function>, <function>memcpy_fromio</function> + and <function>memset_io</function> functions are provided. + Do not use memset or memcpy on IO addresses; they + are not guaranteed to copy data in order. + </para> + + <para> + The read and write functions are defined to be ordered. That is the + compiler is not permitted to reorder the I/O sequence. When the + ordering can be compiler optimised, you can use <function> + __readb</function> and friends to indicate the relaxed ordering. Use + this with care. + </para> + + <para> + While the basic functions are defined to be synchronous with respect + to each other and ordered with respect to each other the busses the + devices sit on may themselves have asynchronicity. In particular many + authors are burned by the fact that PCI bus writes are posted + asynchronously. A driver author must issue a read from the same + device to ensure that writes have occurred in the specific cases the + author cares. This kind of property cannot be hidden from driver + writers in the API. In some cases, the read used to flush the device + may be expected to fail (if the card is resetting, for example). In + that case, the read should be done from config space, which is + guaranteed to soft-fail if the card doesn't respond. + </para> + + <para> + The following is an example of flushing a write to a device when + the driver would like to ensure the write's effects are visible prior + to continuing execution. + </para> + +<programlisting> +static inline void +qla1280_disable_intrs(struct scsi_qla_host *ha) +{ + struct device_reg *reg; + + reg = ha->iobase; + /* disable risc and host interrupts */ + WRT_REG_WORD(&reg->ictrl, 0); + /* + * The following read will ensure that the above write + * has been received by the device before we return from this + * function. + */ + RD_REG_WORD(&reg->ictrl); + ha->flags.ints_enabled = 0; +} +</programlisting> + + <para> + In addition to write posting, on some large multiprocessing systems + (e.g. SGI Challenge, Origin and Altix machines) posted writes won't + be strongly ordered coming from different CPUs. Thus it's important + to properly protect parts of your driver that do memory-mapped writes + with locks and use the <function>mmiowb</function> to make sure they + arrive in the order intended. Issuing a regular <function>readX + </function> will also ensure write ordering, but should only be used + when the driver has to be sure that the write has actually arrived + at the device (not that it's simply ordered with respect to other + writes), since a full <function>readX</function> is a relatively + expensive operation. + </para> + + <para> + Generally, one should use <function>mmiowb</function> prior to + releasing a spinlock that protects regions using <function>writeb + </function> or similar functions that aren't surrounded by <function> + readb</function> calls, which will ensure ordering and flushing. The + following pseudocode illustrates what might occur if write ordering + isn't guaranteed via <function>mmiowb</function> or one of the + <function>readX</function> functions. + </para> + +<programlisting> +CPU A: spin_lock_irqsave(&dev_lock, flags) +CPU A: ... +CPU A: writel(newval, ring_ptr); +CPU A: spin_unlock_irqrestore(&dev_lock, flags) + ... +CPU B: spin_lock_irqsave(&dev_lock, flags) +CPU B: writel(newval2, ring_ptr); +CPU B: ... +CPU B: spin_unlock_irqrestore(&dev_lock, flags) +</programlisting> + + <para> + In the case above, newval2 could be written to ring_ptr before + newval. Fixing it is easy though: + </para> + +<programlisting> +CPU A: spin_lock_irqsave(&dev_lock, flags) +CPU A: ... +CPU A: writel(newval, ring_ptr); +CPU A: mmiowb(); /* ensure no other writes beat us to the device */ +CPU A: spin_unlock_irqrestore(&dev_lock, flags) + ... +CPU B: spin_lock_irqsave(&dev_lock, flags) +CPU B: writel(newval2, ring_ptr); +CPU B: ... +CPU B: mmiowb(); +CPU B: spin_unlock_irqrestore(&dev_lock, flags) +</programlisting> + + <para> + See tg3.c for a real world example of how to use <function>mmiowb + </function> + </para> + + <para> + PCI ordering rules also guarantee that PIO read responses arrive + after any outstanding DMA writes from that bus, since for some devices + the result of a <function>readb</function> call may signal to the + driver that a DMA transaction is complete. In many cases, however, + the driver may want to indicate that the next + <function>readb</function> call has no relation to any previous DMA + writes performed by the device. The driver can use + <function>readb_relaxed</function> for these cases, although only + some platforms will honor the relaxed semantics. Using the relaxed + read functions will provide significant performance benefits on + platforms that support it. The qla2xxx driver provides examples + of how to use <function>readX_relaxed</function>. In many cases, + a majority of the driver's <function>readX</function> calls can + safely be converted to <function>readX_relaxed</function> calls, since + only a few will indicate or depend on DMA completion. + </para> + </sect1> + + </chapter> + + <chapter id="port_space_accesses"> + <title>Port Space Accesses</title> + <sect1 id="port_space_explained"> + <title>Port Space Explained</title> + + <para> + Another form of IO commonly supported is Port Space. This is a + range of addresses separate to the normal memory address space. + Access to these addresses is generally not as fast as accesses + to the memory mapped addresses, and it also has a potentially + smaller address space. + </para> + + <para> + Unlike memory mapped IO, no preparation is required + to access port space. + </para> + + </sect1> + <sect1 id="accessing_port_space"> + <title>Accessing Port Space</title> + <para> + Accesses to this space are provided through a set of functions + which allow 8-bit, 16-bit and 32-bit accesses; also + known as byte, word and long. These functions are + <function>inb</function>, <function>inw</function>, + <function>inl</function>, <function>outb</function>, + <function>outw</function> and <function>outl</function>. + </para> + + <para> + Some variants are provided for these functions. Some devices + require that accesses to their ports are slowed down. This + functionality is provided by appending a <function>_p</function> + to the end of the function. There are also equivalents to memcpy. + The <function>ins</function> and <function>outs</function> + functions copy bytes, words or longs to the given port. + </para> + </sect1> + + </chapter> + + <chapter id="pubfunctions"> + <title>Public Functions Provided</title> +!Iarch/x86/include/asm/io_32.h +!Elib/iomap.c + </chapter> + +</book> diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl new file mode 100644 index 0000000..5e87ad5 --- /dev/null +++ b/Documentation/DocBook/filesystems.tmpl @@ -0,0 +1,421 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="Linux-filesystems-API"> + <bookinfo> + <title>Linux Filesystems API</title> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="vfs"> + <title>The Linux VFS</title> + <sect1 id="the_filesystem_types"><title>The Filesystem types</title> +!Iinclude/linux/fs.h + </sect1> + <sect1 id="the_directory_cache"><title>The Directory Cache</title> +!Efs/dcache.c +!Iinclude/linux/dcache.h + </sect1> + <sect1 id="inode_handling"><title>Inode Handling</title> +!Efs/inode.c +!Efs/bad_inode.c + </sect1> + <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title> +!Efs/super.c + </sect1> + <sect1 id="file_locks"><title>File Locks</title> +!Efs/locks.c +!Ifs/locks.c + </sect1> + <sect1 id="other_functions"><title>Other Functions</title> +!Efs/mpage.c +!Efs/namei.c +!Efs/buffer.c +!Efs/bio.c +!Efs/seq_file.c +!Efs/filesystems.c +!Efs/fs-writeback.c +!Efs/block_dev.c + </sect1> + </chapter> + + <chapter id="proc"> + <title>The proc filesystem</title> + + <sect1 id="sysctl_interface"><title>sysctl interface</title> +!Ekernel/sysctl.c + </sect1> + + <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title> +!Ifs/proc/base.c + </sect1> + </chapter> + + <chapter id="sysfs"> + <title>The Filesystem for Exporting Kernel Objects</title> +!Efs/sysfs/file.c +!Efs/sysfs/symlink.c +!Efs/sysfs/bin.c + </chapter> + + <chapter id="debugfs"> + <title>The debugfs filesystem</title> + + <sect1 id="debugfs_interface"><title>debugfs interface</title> +!Efs/debugfs/inode.c +!Efs/debugfs/file.c + </sect1> + </chapter> + + <chapter id="LinuxJDBAPI"> + <chapterinfo> + <title>The Linux Journalling API</title> + + <authorgroup> + <author> + <firstname>Roger</firstname> + <surname>Gammans</surname> + <affiliation> + <address> + <email>rgammans@computer-surgery.co.uk</email> + </address> + </affiliation> + </author> + </authorgroup> + + <authorgroup> + <author> + <firstname>Stephen</firstname> + <surname>Tweedie</surname> + <affiliation> + <address> + <email>sct@redhat.com</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2002</year> + <holder>Roger Gammans</holder> + </copyright> + </chapterinfo> + + <title>The Linux Journalling API</title> + + <sect1 id="journaling_overview"> + <title>Overview</title> + <sect2 id="journaling_details"> + <title>Details</title> +<para> +The journalling layer is easy to use. You need to +first of all create a journal_t data structure. There are +two calls to do this dependent on how you decide to allocate the physical +media on which the journal resides. The journal_init_inode() call +is for journals stored in filesystem inodes, or the journal_init_dev() +call can be use for journal stored on a raw device (in a continuous range +of blocks). A journal_t is a typedef for a struct pointer, so when +you are finally finished make sure you call journal_destroy() on it +to free up any used kernel memory. +</para> + +<para> +Once you have got your journal_t object you need to 'mount' or load the journal +file, unless of course you haven't initialised it yet - in which case you +need to call journal_create(). +</para> + +<para> +Most of the time however your journal file will already have been created, but +before you load it you must call journal_wipe() to empty the journal file. +Hang on, you say , what if the filesystem wasn't cleanly umount()'d . Well, it is the +job of the client file system to detect this and skip the call to journal_wipe(). +</para> + +<para> +In either case the next call should be to journal_load() which prepares the +journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery() +for you if it detects any outstanding transactions in the journal and similarly +journal_load() will call journal_recover() if necessary. +I would advise reading fs/ext3/super.c for examples on this stage. +[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly +complicate the API. Or isn't a good idea for the journal layer to hide +dirty mounts from the client fs] +</para> + +<para> +Now you can go ahead and start modifying the underlying +filesystem. Almost. +</para> + +<para> + +You still need to actually journal your filesystem changes, this +is done by wrapping them into transactions. Additionally you +also need to wrap the modification of each of the buffers +with calls to the journal layer, so it knows what the modifications +you are actually making are. To do this use journal_start() which +returns a transaction handle. +</para> + +<para> +journal_start() +and its counterpart journal_stop(), which indicates the end of a transaction +are nestable calls, so you can reenter a transaction if necessary, +but remember you must call journal_stop() the same number of times as +journal_start() before the transaction is completed (or more accurately +leaves the update phase). Ext3/VFS makes use of this feature to simplify +quota support. +</para> + +<para> +Inside each transaction you need to wrap the modifications to the +individual buffers (blocks). Before you start to modify a buffer you +need to call journal_get_{create,write,undo}_access() as appropriate, +this allows the journalling layer to copy the unmodified data if it +needs to. After all the buffer may be part of a previously uncommitted +transaction. +At this point you are at last ready to modify a buffer, and once +you are have done so you need to call journal_dirty_{meta,}data(). +Or if you've asked for access to a buffer you now know is now longer +required to be pushed back on the device you can call journal_forget() +in much the same way as you might have used bforget() in the past. +</para> + +<para> +A journal_flush() may be called at any time to commit and checkpoint +all your transactions. +</para> + +<para> +Then at umount time , in your put_super() (2.4) or write_super() (2.5) +you can then call journal_destroy() to clean up your in-core journal object. +</para> + +<para> +Unfortunately there a couple of ways the journal layer can cause a deadlock. +The first thing to note is that each task can only have +a single outstanding transaction at any one time, remember nothing +commits until the outermost journal_stop(). This means +you must complete the transaction at the end of each file/inode/address +etc. operation you perform, so that the journalling system isn't re-entered +on another journal. Since transactions can't be nested/batched +across differing journals, and another filesystem other than +yours (say ext3) may be modified in a later syscall. +</para> + +<para> +The second case to bear in mind is that journal_start() can +block if there isn't enough space in the journal for your transaction +(based on the passed nblocks param) - when it blocks it merely(!) needs to +wait for transactions to complete and be committed from other tasks, +so essentially we are waiting for journal_stop(). So to avoid +deadlocks you must treat journal_start/stop() as if they +were semaphores and include them in your semaphore ordering rules to prevent +deadlocks. Note that journal_extend() has similar blocking behaviour to +journal_start() so you can deadlock here just as easily as on journal_start(). +</para> + +<para> +Try to reserve the right number of blocks the first time. ;-). This will +be the maximum number of blocks you are going to touch in this transaction. +I advise having a look at at least ext3_jbd.h to see the basis on which +ext3 uses to make these decisions. +</para> + +<para> +Another wriggle to watch out for is your on-disk block allocation strategy. +why? Because, if you undo a delete, you need to ensure you haven't reused any +of the freed blocks in a later transaction. One simple way of doing this +is make sure any blocks you allocate only have checkpointed transactions +listed against them. Ext3 does this in ext3_test_allocatable(). +</para> + +<para> +Lock is also providing through journal_{un,}lock_updates(), +ext3 uses this when it wants a window with a clean and stable fs for a moment. +eg. +</para> + +<programlisting> + + journal_lock_updates() //stop new stuff happening.. + journal_flush() // checkpoint everything. + ..do stuff on stable fs + journal_unlock_updates() // carry on with filesystem use. +</programlisting> + +<para> +The opportunities for abuse and DOS attacks with this should be obvious, +if you allow unprivileged userspace to trigger codepaths containing these +calls. +</para> + +<para> +A new feature of jbd since 2.5.25 is commit callbacks with the new +journal_callback_set() function you can now ask the journalling layer +to call you back when the transaction is finally committed to disk, so that +you can do some of your own management. The key to this is the journal_callback +struct, this maintains the internal callback information but you can +extend it like this:- +</para> +<programlisting> + struct myfs_callback_s { + //Data structure element required by jbd.. + struct journal_callback for_jbd; + // Stuff for myfs allocated together. + myfs_inode* i_commited; + + } +</programlisting> + +<para> +this would be useful if you needed to know when data was committed to a +particular inode. +</para> + + </sect2> + + <sect2 id="jbd_summary"> + <title>Summary</title> +<para> +Using the journal is a matter of wrapping the different context changes, +being each mount, each modification (transaction) and each changed buffer +to tell the journalling layer about them. +</para> + +<para> +Here is a some pseudo code to give you an idea of how it works, as +an example. +</para> + +<programlisting> + journal_t* my_jnrl = journal_create(); + journal_init_{dev,inode}(jnrl,...) + if (clean) journal_wipe(); + journal_load(); + + foreach(transaction) { /*transactions must be + completed before + a syscall returns to + userspace*/ + + handle_t * xct=journal_start(my_jnrl); + foreach(bh) { + journal_get_{create,write,undo}_access(xact,bh); + if ( myfs_modify(bh) ) { /* returns true + if makes changes */ + journal_dirty_{meta,}data(xact,bh); + } else { + journal_forget(bh); + } + } + journal_stop(xct); + } + journal_destroy(my_jrnl); +</programlisting> + </sect2> + + </sect1> + + <sect1 id="data_types"> + <title>Data Types</title> + <para> + The journalling layer uses typedefs to 'hide' the concrete definitions + of the structures used. As a client of the JBD layer you can + just rely on the using the pointer as a magic cookie of some sort. + + Obviously the hiding is not enforced as this is 'C'. + </para> + <sect2 id="structures"><title>Structures</title> +!Iinclude/linux/jbd.h + </sect2> + </sect1> + + <sect1 id="functions"> + <title>Functions</title> + <para> + The functions here are split into two groups those that + affect a journal as a whole, and those which are used to + manage transactions + </para> + <sect2 id="journal_level"><title>Journal Level</title> +!Efs/jbd/journal.c +!Ifs/jbd/recovery.c + </sect2> + <sect2 id="transaction_level"><title>Transasction Level</title> +!Efs/jbd/transaction.c + </sect2> + </sect1> + <sect1 id="see_also"> + <title>See also</title> + <para> + <citation> + <ulink url="ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz"> + Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie + </ulink> + </citation> + </para> + <para> + <citation> + <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html"> + Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie + </ulink> + </citation> + </para> + </sect1> + + </chapter> + + <chapter id="splice"> + <title>splice API</title> + <para> + splice is a method for moving blocks of data around inside the + kernel, without continually transferring them between the kernel + and user space. + </para> +!Ffs/splice.c + </chapter> + + <chapter id="pipes"> + <title>pipes API</title> + <para> + Pipe interfaces are all for in-kernel (builtin image) use. + They are not exported for use by modules. + </para> +!Iinclude/linux/pipe_fs_i.h +!Ffs/pipe.c + </chapter> + +</book> diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl new file mode 100644 index 0000000..6ef2f00 --- /dev/null +++ b/Documentation/DocBook/gadget.tmpl @@ -0,0 +1,793 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="USB-Gadget-API"> + <bookinfo> + <title>USB Gadget API for Linux</title> + <date>20 August 2004</date> + <edition>20 August 2004</edition> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + <copyright> + <year>2003-2004</year> + <holder>David Brownell</holder> + </copyright> + + <author> + <firstname>David</firstname> + <surname>Brownell</surname> + <affiliation> + <address><email>dbrownell@users.sourceforge.net</email></address> + </affiliation> + </author> + </bookinfo> + +<toc></toc> + +<chapter id="intro"><title>Introduction</title> + +<para>This document presents a Linux-USB "Gadget" +kernel mode +API, for use within peripherals and other USB devices +that embed Linux. +It provides an overview of the API structure, +and shows how that fits into a system development project. +This is the first such API released on Linux to address +a number of important problems, including: </para> + +<itemizedlist> + <listitem><para>Supports USB 2.0, for high speed devices which + can stream data at several dozen megabytes per second. + </para></listitem> + <listitem><para>Handles devices with dozens of endpoints just as + well as ones with just two fixed-function ones. Gadget drivers + can be written so they're easy to port to new hardware. + </para></listitem> + <listitem><para>Flexible enough to expose more complex USB device + capabilities such as multiple configurations, multiple interfaces, + composite devices, + and alternate interface settings. + </para></listitem> + <listitem><para>USB "On-The-Go" (OTG) support, in conjunction + with updates to the Linux-USB host side. + </para></listitem> + <listitem><para>Sharing data structures and API models with the + Linux-USB host side API. This helps the OTG support, and + looks forward to more-symmetric frameworks (where the same + I/O model is used by both host and device side drivers). + </para></listitem> + <listitem><para>Minimalist, so it's easier to support new device + controller hardware. I/O processing doesn't imply large + demands for memory or CPU resources. + </para></listitem> +</itemizedlist> + + +<para>Most Linux developers will not be able to use this API, since they +have USB "host" hardware in a PC, workstation, or server. +Linux users with embedded systems are more likely to +have USB peripheral hardware. +To distinguish drivers running inside such hardware from the +more familiar Linux "USB device drivers", +which are host side proxies for the real USB devices, +a different term is used: +the drivers inside the peripherals are "USB gadget drivers". +In USB protocol interactions, the device driver is the master +(or "client driver") +and the gadget driver is the slave (or "function driver"). +</para> + +<para>The gadget API resembles the host side Linux-USB API in that both +use queues of request objects to package I/O buffers, and those requests +may be submitted or canceled. +They share common definitions for the standard USB +<emphasis>Chapter 9</emphasis> messages, structures, and constants. +Also, both APIs bind and unbind drivers to devices. +The APIs differ in detail, since the host side's current +URB framework exposes a number of implementation details +and assumptions that are inappropriate for a gadget API. +While the model for control transfers and configuration +management is necessarily different (one side is a hardware-neutral master, +the other is a hardware-aware slave), the endpoint I/0 API used here +should also be usable for an overhead-reduced host side API. +</para> + +</chapter> + +<chapter id="structure"><title>Structure of Gadget Drivers</title> + +<para>A system running inside a USB peripheral +normally has at least three layers inside the kernel to handle +USB protocol processing, and may have additional layers in +user space code. +The "gadget" API is used by the middle layer to interact +with the lowest level (which directly handles hardware). +</para> + +<para>In Linux, from the bottom up, these layers are: +</para> + +<variablelist> + + <varlistentry> + <term><emphasis>USB Controller Driver</emphasis></term> + + <listitem> + <para>This is the lowest software level. + It is the only layer that talks to hardware, + through registers, fifos, dma, irqs, and the like. + The <filename><linux/usb/gadget.h></filename> API abstracts + the peripheral controller endpoint hardware. + That hardware is exposed through endpoint objects, which accept + streams of IN/OUT buffers, and through callbacks that interact + with gadget drivers. + Since normal USB devices only have one upstream + port, they only have one of these drivers. + The controller driver can support any number of different + gadget drivers, but only one of them can be used at a time. + </para> + + <para>Examples of such controller hardware include + the PCI-based NetChip 2280 USB 2.0 high speed controller, + the SA-11x0 or PXA-25x UDC (found within many PDAs), + and a variety of other products. + </para> + + </listitem></varlistentry> + + <varlistentry> + <term><emphasis>Gadget Driver</emphasis></term> + + <listitem> + <para>The lower boundary of this driver implements hardware-neutral + USB functions, using calls to the controller driver. + Because such hardware varies widely in capabilities and restrictions, + and is used in embedded environments where space is at a premium, + the gadget driver is often configured at compile time + to work with endpoints supported by one particular controller. + Gadget drivers may be portable to several different controllers, + using conditional compilation. + (Recent kernels substantially simplify the work involved in + supporting new hardware, by <emphasis>autoconfiguring</emphasis> + endpoints automatically for many bulk-oriented drivers.) + Gadget driver responsibilities include: + </para> + <itemizedlist> + <listitem><para>handling setup requests (ep0 protocol responses) + possibly including class-specific functionality + </para></listitem> + <listitem><para>returning configuration and string descriptors + </para></listitem> + <listitem><para>(re)setting configurations and interface + altsettings, including enabling and configuring endpoints + </para></listitem> + <listitem><para>handling life cycle events, such as managing + bindings to hardware, + USB suspend/resume, remote wakeup, + and disconnection from the USB host. + </para></listitem> + <listitem><para>managing IN and OUT transfers on all currently + enabled endpoints + </para></listitem> + </itemizedlist> + + <para> + Such drivers may be modules of proprietary code, although + that approach is discouraged in the Linux community. + </para> + </listitem></varlistentry> + + <varlistentry> + <term><emphasis>Upper Level</emphasis></term> + + <listitem> + <para>Most gadget drivers have an upper boundary that connects + to some Linux driver or framework in Linux. + Through that boundary flows the data which the gadget driver + produces and/or consumes through protocol transfers over USB. + Examples include: + </para> + <itemizedlist> + <listitem><para>user mode code, using generic (gadgetfs) + or application specific files in + <filename>/dev</filename> + </para></listitem> + <listitem><para>networking subsystem (for network gadgets, + like the CDC Ethernet Model gadget driver) + </para></listitem> + <listitem><para>data capture drivers, perhaps video4Linux or + a scanner driver; or test and measurement hardware. + </para></listitem> + <listitem><para>input subsystem (for HID gadgets) + </para></listitem> + <listitem><para>sound subsystem (for audio gadgets) + </para></listitem> + <listitem><para>file system (for PTP gadgets) + </para></listitem> + <listitem><para>block i/o subsystem (for usb-storage gadgets) + </para></listitem> + <listitem><para>... and more </para></listitem> + </itemizedlist> + </listitem></varlistentry> + + <varlistentry> + <term><emphasis>Additional Layers</emphasis></term> + + <listitem> + <para>Other layers may exist. + These could include kernel layers, such as network protocol stacks, + as well as user mode applications building on standard POSIX + system call APIs such as + <emphasis>open()</emphasis>, <emphasis>close()</emphasis>, + <emphasis>read()</emphasis> and <emphasis>write()</emphasis>. + On newer systems, POSIX Async I/O calls may be an option. + Such user mode code will not necessarily be subject to + the GNU General Public License (GPL). + </para> + </listitem></varlistentry> + + +</variablelist> + +<para>OTG-capable systems will also need to include a standard Linux-USB +host side stack, +with <emphasis>usbcore</emphasis>, +one or more <emphasis>Host Controller Drivers</emphasis> (HCDs), +<emphasis>USB Device Drivers</emphasis> to support +the OTG "Targeted Peripheral List", +and so forth. +There will also be an <emphasis>OTG Controller Driver</emphasis>, +which is visible to gadget and device driver developers only indirectly. +That helps the host and device side USB controllers implement the +two new OTG protocols (HNP and SRP). +Roles switch (host to peripheral, or vice versa) using HNP +during USB suspend processing, and SRP can be viewed as a +more battery-friendly kind of device wakeup protocol. +</para> + +<para>Over time, reusable utilities are evolving to help make some +gadget driver tasks simpler. +For example, building configuration descriptors from vectors of +descriptors for the configurations interfaces and endpoints is +now automated, and many drivers now use autoconfiguration to +choose hardware endpoints and initialize their descriptors. + +A potential example of particular interest +is code implementing standard USB-IF protocols for +HID, networking, storage, or audio classes. +Some developers are interested in KDB or KGDB hooks, to let +target hardware be remotely debugged. +Most such USB protocol code doesn't need to be hardware-specific, +any more than network protocols like X11, HTTP, or NFS are. +Such gadget-side interface drivers should eventually be combined, +to implement composite devices. +</para> + +</chapter> + + +<chapter id="api"><title>Kernel Mode Gadget API</title> + +<para>Gadget drivers declare themselves through a +<emphasis>struct usb_gadget_driver</emphasis>, which is responsible for +most parts of enumeration for a <emphasis>struct usb_gadget</emphasis>. +The response to a set_configuration usually involves +enabling one or more of the <emphasis>struct usb_ep</emphasis> objects +exposed by the gadget, and submitting one or more +<emphasis>struct usb_request</emphasis> buffers to transfer data. +Understand those four data types, and their operations, and +you will understand how this API works. +</para> + +<note><title>Incomplete Data Type Descriptions</title> + +<para>This documentation was prepared using the standard Linux +kernel <filename>docproc</filename> tool, which turns text +and in-code comments into SGML DocBook and then into usable +formats such as HTML or PDF. +Other than the "Chapter 9" data types, most of the significant +data types and functions are described here. +</para> + +<para>However, docproc does not understand all the C constructs +that are used, so some relevant information is likely omitted from +what you are reading. +One example of such information is endpoint autoconfiguration. +You'll have to read the header file, and use example source +code (such as that for "Gadget Zero"), to fully understand the API. +</para> + +<para>The part of the API implementing some basic +driver capabilities is specific to the version of the +Linux kernel that's in use. +The 2.6 kernel includes a <emphasis>driver model</emphasis> +framework that has no analogue on earlier kernels; +so those parts of the gadget API are not fully portable. +(They are implemented on 2.4 kernels, but in a different way.) +The driver model state is another part of this API that is +ignored by the kerneldoc tools. +</para> +</note> + +<para>The core API does not expose +every possible hardware feature, only the most widely available ones. +There are significant hardware features, such as device-to-device DMA +(without temporary storage in a memory buffer) +that would be added using hardware-specific APIs. +</para> + +<para>This API allows drivers to use conditional compilation to handle +endpoint capabilities of different hardware, but doesn't require that. +Hardware tends to have arbitrary restrictions, relating to +transfer types, addressing, packet sizes, buffering, and availability. +As a rule, such differences only matter for "endpoint zero" logic +that handles device configuration and management. +The API supports limited run-time +detection of capabilities, through naming conventions for endpoints. +Many drivers will be able to at least partially autoconfigure +themselves. +In particular, driver init sections will often have endpoint +autoconfiguration logic that scans the hardware's list of endpoints +to find ones matching the driver requirements +(relying on those conventions), to eliminate some of the most +common reasons for conditional compilation. +</para> + +<para>Like the Linux-USB host side API, this API exposes +the "chunky" nature of USB messages: I/O requests are in terms +of one or more "packets", and packet boundaries are visible to drivers. +Compared to RS-232 serial protocols, USB resembles +synchronous protocols like HDLC +(N bytes per frame, multipoint addressing, host as the primary +station and devices as secondary stations) +more than asynchronous ones +(tty style: 8 data bits per frame, no parity, one stop bit). +So for example the controller drivers won't buffer +two single byte writes into a single two-byte USB IN packet, +although gadget drivers may do so when they implement +protocols where packet boundaries (and "short packets") +are not significant. +</para> + +<sect1 id="lifecycle"><title>Driver Life Cycle</title> + +<para>Gadget drivers make endpoint I/O requests to hardware without +needing to know many details of the hardware, but driver +setup/configuration code needs to handle some differences. +Use the API like this: +</para> + +<orderedlist numeration='arabic'> + +<listitem><para>Register a driver for the particular device side +usb controller hardware, +such as the net2280 on PCI (USB 2.0), +sa11x0 or pxa25x as found in Linux PDAs, +and so on. +At this point the device is logically in the USB ch9 initial state +("attached"), drawing no power and not usable +(since it does not yet support enumeration). +Any host should not see the device, since it's not +activated the data line pullup used by the host to +detect a device, even if VBUS power is available. +</para></listitem> + +<listitem><para>Register a gadget driver that implements some higher level +device function. That will then bind() to a usb_gadget, which +activates the data line pullup sometime after detecting VBUS. +</para></listitem> + +<listitem><para>The hardware driver can now start enumerating. +The steps it handles are to accept USB power and set_address requests. +Other steps are handled by the gadget driver. +If the gadget driver module is unloaded before the host starts to +enumerate, steps before step 7 are skipped. +</para></listitem> + +<listitem><para>The gadget driver's setup() call returns usb descriptors, +based both on what the bus interface hardware provides and on the +functionality being implemented. +That can involve alternate settings or configurations, +unless the hardware prevents such operation. +For OTG devices, each configuration descriptor includes +an OTG descriptor. +</para></listitem> + +<listitem><para>The gadget driver handles the last step of enumeration, +when the USB host issues a set_configuration call. +It enables all endpoints used in that configuration, +with all interfaces in their default settings. +That involves using a list of the hardware's endpoints, enabling each +endpoint according to its descriptor. +It may also involve using <function>usb_gadget_vbus_draw</function> +to let more power be drawn from VBUS, as allowed by that configuration. +For OTG devices, setting a configuration may also involve reporting +HNP capabilities through a user interface. +</para></listitem> + +<listitem><para>Do real work and perform data transfers, possibly involving +changes to interface settings or switching to new configurations, until the +device is disconnect()ed from the host. +Queue any number of transfer requests to each endpoint. +It may be suspended and resumed several times before being disconnected. +On disconnect, the drivers go back to step 3 (above). +</para></listitem> + +<listitem><para>When the gadget driver module is being unloaded, +the driver unbind() callback is issued. That lets the controller +driver be unloaded. +</para></listitem> + +</orderedlist> + +<para>Drivers will normally be arranged so that just loading the +gadget driver module (or statically linking it into a Linux kernel) +allows the peripheral device to be enumerated, but some drivers +will defer enumeration until some higher level component (like +a user mode daemon) enables it. +Note that at this lowest level there are no policies about how +ep0 configuration logic is implemented, +except that it should obey USB specifications. +Such issues are in the domain of gadget drivers, +including knowing about implementation constraints +imposed by some USB controllers +or understanding that composite devices might happen to +be built by integrating reusable components. +</para> + +<para>Note that the lifecycle above can be slightly different +for OTG devices. +Other than providing an additional OTG descriptor in each +configuration, only the HNP-related differences are particularly +visible to driver code. +They involve reporting requirements during the SET_CONFIGURATION +request, and the option to invoke HNP during some suspend callbacks. +Also, SRP changes the semantics of +<function>usb_gadget_wakeup</function> +slightly. +</para> + +</sect1> + +<sect1 id="ch9"><title>USB 2.0 Chapter 9 Types and Constants</title> + +<para>Gadget drivers +rely on common USB structures and constants +defined in the +<filename><linux/usb/ch9.h></filename> +header file, which is standard in Linux 2.6 kernels. +These are the same types and constants used by host +side drivers (and usbcore). +</para> + +!Iinclude/linux/usb/ch9.h +</sect1> + +<sect1 id="core"><title>Core Objects and Methods</title> + +<para>These are declared in +<filename><linux/usb/gadget.h></filename>, +and are used by gadget drivers to interact with +USB peripheral controller drivers. +</para> + + <!-- yeech, this is ugly in nsgmls PDF output. + + the PDF bookmark and refentry output nesting is wrong, + and the member/argument documentation indents ugly. + + plus something (docproc?) adds whitespace before the + descriptive paragraph text, so it can't line up right + unless the explanations are trivial. + --> + +!Iinclude/linux/usb/gadget.h +</sect1> + +<sect1 id="utils"><title>Optional Utilities</title> + +<para>The core API is sufficient for writing a USB Gadget Driver, +but some optional utilities are provided to simplify common tasks. +These utilities include endpoint autoconfiguration. +</para> + +!Edrivers/usb/gadget/usbstring.c +!Edrivers/usb/gadget/config.c +<!-- !Edrivers/usb/gadget/epautoconf.c --> +</sect1> + +<sect1 id="composite"><title>Composite Device Framework</title> + +<para>The core API is sufficient for writing drivers for composite +USB devices (with more than one function in a given configuration), +and also multi-configuration devices (also more than one function, +but not necessarily sharing a given configuration). +There is however an optional framework which makes it easier to +reuse and combine functions. +</para> + +<para>Devices using this framework provide a <emphasis>struct +usb_composite_driver</emphasis>, which in turn provides one or +more <emphasis>struct usb_configuration</emphasis> instances. +Each such configuration includes at least one +<emphasis>struct usb_function</emphasis>, which packages a user +visible role such as "network link" or "mass storage device". +Management functions may also exist, such as "Device Firmware +Upgrade". +</para> + +!Iinclude/linux/usb/composite.h +!Edrivers/usb/gadget/composite.c + +</sect1> + +<sect1 id="functions"><title>Composite Device Functions</title> + +<para>At this writing, a few of the current gadget drivers have +been converted to this framework. +Near-term plans include converting all of them, except for "gadgetfs". +</para> + +!Edrivers/usb/gadget/f_acm.c +!Edrivers/usb/gadget/f_ecm.c +!Edrivers/usb/gadget/f_subset.c +!Edrivers/usb/gadget/f_obex.c +!Edrivers/usb/gadget/f_serial.c + +</sect1> + + +</chapter> + +<chapter id="controllers"><title>Peripheral Controller Drivers</title> + +<para>The first hardware supporting this API was the NetChip 2280 +controller, which supports USB 2.0 high speed and is based on PCI. +This is the <filename>net2280</filename> driver module. +The driver supports Linux kernel versions 2.4 and 2.6; +contact NetChip Technologies for development boards and product +information. +</para> + +<para>Other hardware working in the "gadget" framework includes: +Intel's PXA 25x and IXP42x series processors +(<filename>pxa2xx_udc</filename>), +Toshiba TC86c001 "Goku-S" (<filename>goku_udc</filename>), +Renesas SH7705/7727 (<filename>sh_udc</filename>), +MediaQ 11xx (<filename>mq11xx_udc</filename>), +Hynix HMS30C7202 (<filename>h7202_udc</filename>), +National 9303/4 (<filename>n9604_udc</filename>), +Texas Instruments OMAP (<filename>omap_udc</filename>), +Sharp LH7A40x (<filename>lh7a40x_udc</filename>), +and more. +Most of those are full speed controllers. +</para> + +<para>At this writing, there are people at work on drivers in +this framework for several other USB device controllers, +with plans to make many of them be widely available. +</para> + +<!-- !Edrivers/usb/gadget/net2280.c --> + +<para>A partial USB simulator, +the <filename>dummy_hcd</filename> driver, is available. +It can act like a net2280, a pxa25x, or an sa11x0 in terms +of available endpoints and device speeds; and it simulates +control, bulk, and to some extent interrupt transfers. +That lets you develop some parts of a gadget driver on a normal PC, +without any special hardware, and perhaps with the assistance +of tools such as GDB running with User Mode Linux. +At least one person has expressed interest in adapting that +approach, hooking it up to a simulator for a microcontroller. +Such simulators can help debug subsystems where the runtime hardware +is unfriendly to software development, or is not yet available. +</para> + +<para>Support for other controllers is expected to be developed +and contributed +over time, as this driver framework evolves. +</para> + +</chapter> + +<chapter id="gadget"><title>Gadget Drivers</title> + +<para>In addition to <emphasis>Gadget Zero</emphasis> +(used primarily for testing and development with drivers +for usb controller hardware), other gadget drivers exist. +</para> + +<para>There's an <emphasis>ethernet</emphasis> gadget +driver, which implements one of the most useful +<emphasis>Communications Device Class</emphasis> (CDC) models. +One of the standards for cable modem interoperability even +specifies the use of this ethernet model as one of two +mandatory options. +Gadgets using this code look to a USB host as if they're +an Ethernet adapter. +It provides access to a network where the gadget's CPU is one host, +which could easily be bridging, routing, or firewalling +access to other networks. +Since some hardware can't fully implement the CDC Ethernet +requirements, this driver also implements a "good parts only" +subset of CDC Ethernet. +(That subset doesn't advertise itself as CDC Ethernet, +to avoid creating problems.) +</para> + +<para>Support for Microsoft's <emphasis>RNDIS</emphasis> +protocol has been contributed by Pengutronix and Auerswald GmbH. +This is like CDC Ethernet, but it runs on more slightly USB hardware +(but less than the CDC subset). +However, its main claim to fame is being able to connect directly to +recent versions of Windows, using drivers that Microsoft bundles +and supports, making it much simpler to network with Windows. +</para> + +<para>There is also support for user mode gadget drivers, +using <emphasis>gadgetfs</emphasis>. +This provides a <emphasis>User Mode API</emphasis> that presents +each endpoint as a single file descriptor. I/O is done using +normal <emphasis>read()</emphasis> and <emphasis>read()</emphasis> calls. +Familiar tools like GDB and pthreads can be used to +develop and debug user mode drivers, so that once a robust +controller driver is available many applications for it +won't require new kernel mode software. +Linux 2.6 <emphasis>Async I/O (AIO)</emphasis> +support is available, so that user mode software +can stream data with only slightly more overhead +than a kernel driver. +</para> + +<para>There's a USB Mass Storage class driver, which provides +a different solution for interoperability with systems such +as MS-Windows and MacOS. +That <emphasis>File-backed Storage</emphasis> driver uses a +file or block device as backing store for a drive, +like the <filename>loop</filename> driver. +The USB host uses the BBB, CB, or CBI versions of the mass +storage class specification, using transparent SCSI commands +to access the data from the backing store. +</para> + +<para>There's a "serial line" driver, useful for TTY style +operation over USB. +The latest version of that driver supports CDC ACM style +operation, like a USB modem, and so on most hardware it can +interoperate easily with MS-Windows. +One interesting use of that driver is in boot firmware (like a BIOS), +which can sometimes use that model with very small systems without +real serial lines. +</para> + +<para>Support for other kinds of gadget is expected to +be developed and contributed +over time, as this driver framework evolves. +</para> + +</chapter> + +<chapter id="otg"><title>USB On-The-GO (OTG)</title> + +<para>USB OTG support on Linux 2.6 was initially developed +by Texas Instruments for +<ulink url="http://www.omap.com">OMAP</ulink> 16xx and 17xx +series processors. +Other OTG systems should work in similar ways, but the +hardware level details could be very different. +</para> + +<para>Systems need specialized hardware support to implement OTG, +notably including a special <emphasis>Mini-AB</emphasis> jack +and associated transciever to support <emphasis>Dual-Role</emphasis> +operation: +they can act either as a host, using the standard +Linux-USB host side driver stack, +or as a peripheral, using this "gadget" framework. +To do that, the system software relies on small additions +to those programming interfaces, +and on a new internal component (here called an "OTG Controller") +affecting which driver stack connects to the OTG port. +In each role, the system can re-use the existing pool of +hardware-neutral drivers, layered on top of the controller +driver interfaces (<emphasis>usb_bus</emphasis> or +<emphasis>usb_gadget</emphasis>). +Such drivers need at most minor changes, and most of the calls +added to support OTG can also benefit non-OTG products. +</para> + +<itemizedlist> + <listitem><para>Gadget drivers test the <emphasis>is_otg</emphasis> + flag, and use it to determine whether or not to include + an OTG descriptor in each of their configurations. + </para></listitem> + <listitem><para>Gadget drivers may need changes to support the + two new OTG protocols, exposed in new gadget attributes + such as <emphasis>b_hnp_enable</emphasis> flag. + HNP support should be reported through a user interface + (two LEDs could suffice), and is triggered in some cases + when the host suspends the peripheral. + SRP support can be user-initiated just like remote wakeup, + probably by pressing the same button. + </para></listitem> + <listitem><para>On the host side, USB device drivers need + to be taught to trigger HNP at appropriate moments, using + <function>usb_suspend_device()</function>. + That also conserves battery power, which is useful even + for non-OTG configurations. + </para></listitem> + <listitem><para>Also on the host side, a driver must support the + OTG "Targeted Peripheral List". That's just a whitelist, + used to reject peripherals not supported with a given + Linux OTG host. + <emphasis>This whitelist is product-specific; + each product must modify <filename>otg_whitelist.h</filename> + to match its interoperability specification. + </emphasis> + </para> + <para>Non-OTG Linux hosts, like PCs and workstations, + normally have some solution for adding drivers, so that + peripherals that aren't recognized can eventually be supported. + That approach is unreasonable for consumer products that may + never have their firmware upgraded, and where it's usually + unrealistic to expect traditional PC/workstation/server kinds + of support model to work. + For example, it's often impractical to change device firmware + once the product has been distributed, so driver bugs can't + normally be fixed if they're found after shipment. + </para></listitem> +</itemizedlist> + +<para> +Additional changes are needed below those hardware-neutral +<emphasis>usb_bus</emphasis> and <emphasis>usb_gadget</emphasis> +driver interfaces; those aren't discussed here in any detail. +Those affect the hardware-specific code for each USB Host or Peripheral +controller, and how the HCD initializes (since OTG can be active only +on a single port). +They also involve what may be called an <emphasis>OTG Controller +Driver</emphasis>, managing the OTG transceiver and the OTG state +machine logic as well as much of the root hub behavior for the +OTG port. +The OTG controller driver needs to activate and deactivate USB +controllers depending on the relevant device role. +Some related changes were needed inside usbcore, so that it +can identify OTG-capable devices and respond appropriately +to HNP or SRP protocols. +</para> + +</chapter> + +</book> +<!-- + vim:syntax=sgml:sw=4 +--> diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl new file mode 100644 index 0000000..3a882d9 --- /dev/null +++ b/Documentation/DocBook/genericirq.tmpl @@ -0,0 +1,474 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="Generic-IRQ-Guide"> + <bookinfo> + <title>Linux generic IRQ handling</title> + + <authorgroup> + <author> + <firstname>Thomas</firstname> + <surname>Gleixner</surname> + <affiliation> + <address> + <email>tglx@linutronix.de</email> + </address> + </affiliation> + </author> + <author> + <firstname>Ingo</firstname> + <surname>Molnar</surname> + <affiliation> + <address> + <email>mingo@elte.hu</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2005-2006</year> + <holder>Thomas Gleixner</holder> + </copyright> + <copyright> + <year>2005-2006</year> + <holder>Ingo Molnar</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + The generic interrupt handling layer is designed to provide a + complete abstraction of interrupt handling for device drivers. + It is able to handle all the different types of interrupt controller + hardware. Device drivers use generic API functions to request, enable, + disable and free interrupts. The drivers do not have to know anything + about interrupt hardware details, so they can be used on different + platforms without code changes. + </para> + <para> + This documentation is provided to developers who want to implement + an interrupt subsystem based for their architecture, with the help + of the generic IRQ handling layer. + </para> + </chapter> + + <chapter id="rationale"> + <title>Rationale</title> + <para> + The original implementation of interrupt handling in Linux is using + the __do_IRQ() super-handler, which is able to deal with every + type of interrupt logic. + </para> + <para> + Originally, Russell King identified different types of handlers to + build a quite universal set for the ARM interrupt handler + implementation in Linux 2.5/2.6. He distinguished between: + <itemizedlist> + <listitem><para>Level type</para></listitem> + <listitem><para>Edge type</para></listitem> + <listitem><para>Simple type</para></listitem> + </itemizedlist> + In the SMP world of the __do_IRQ() super-handler another type + was identified: + <itemizedlist> + <listitem><para>Per CPU type</para></listitem> + </itemizedlist> + </para> + <para> + This split implementation of highlevel IRQ handlers allows us to + optimize the flow of the interrupt handling for each specific + interrupt type. This reduces complexity in that particular codepath + and allows the optimized handling of a given type. + </para> + <para> + The original general IRQ implementation used hw_interrupt_type + structures and their ->ack(), ->end() [etc.] callbacks to + differentiate the flow control in the super-handler. This leads to + a mix of flow logic and lowlevel hardware logic, and it also leads + to unnecessary code duplication: for example in i386, there is a + ioapic_level_irq and a ioapic_edge_irq irq-type which share many + of the lowlevel details but have different flow handling. + </para> + <para> + A more natural abstraction is the clean separation of the + 'irq flow' and the 'chip details'. + </para> + <para> + Analysing a couple of architecture's IRQ subsystem implementations + reveals that most of them can use a generic set of 'irq flow' + methods and only need to add the chip level specific code. + The separation is also valuable for (sub)architectures + which need specific quirks in the irq flow itself but not in the + chip-details - and thus provides a more transparent IRQ subsystem + design. + </para> + <para> + Each interrupt descriptor is assigned its own highlevel flow + handler, which is normally one of the generic + implementations. (This highlevel flow handler implementation also + makes it simple to provide demultiplexing handlers which can be + found in embedded platforms on various architectures.) + </para> + <para> + The separation makes the generic interrupt handling layer more + flexible and extensible. For example, an (sub)architecture can + use a generic irq-flow implementation for 'level type' interrupts + and add a (sub)architecture specific 'edge type' implementation. + </para> + <para> + To make the transition to the new model easier and prevent the + breakage of existing implementations, the __do_IRQ() super-handler + is still available. This leads to a kind of duality for the time + being. Over time the new model should be used in more and more + architectures, as it enables smaller and cleaner IRQ subsystems. + </para> + </chapter> + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + None (knock on wood). + </para> + </chapter> + + <chapter id="Abstraction"> + <title>Abstraction layers</title> + <para> + There are three main levels of abstraction in the interrupt code: + <orderedlist> + <listitem><para>Highlevel driver API</para></listitem> + <listitem><para>Highlevel IRQ flow handlers</para></listitem> + <listitem><para>Chiplevel hardware encapsulation</para></listitem> + </orderedlist> + </para> + <sect1 id="Interrupt_control_flow"> + <title>Interrupt control flow</title> + <para> + Each interrupt is described by an interrupt descriptor structure + irq_desc. The interrupt is referenced by an 'unsigned int' numeric + value which selects the corresponding interrupt decription structure + in the descriptor structures array. + The descriptor structure contains status information and pointers + to the interrupt flow method and the interrupt chip structure + which are assigned to this interrupt. + </para> + <para> + Whenever an interrupt triggers, the lowlevel arch code calls into + the generic interrupt code by calling desc->handle_irq(). + This highlevel IRQ handling function only uses desc->chip primitives + referenced by the assigned chip descriptor structure. + </para> + </sect1> + <sect1 id="Highlevel_Driver_API"> + <title>Highlevel Driver API</title> + <para> + The highlevel Driver API consists of following functions: + <itemizedlist> + <listitem><para>request_irq()</para></listitem> + <listitem><para>free_irq()</para></listitem> + <listitem><para>disable_irq()</para></listitem> + <listitem><para>enable_irq()</para></listitem> + <listitem><para>disable_irq_nosync() (SMP only)</para></listitem> + <listitem><para>synchronize_irq() (SMP only)</para></listitem> + <listitem><para>set_irq_type()</para></listitem> + <listitem><para>set_irq_wake()</para></listitem> + <listitem><para>set_irq_data()</para></listitem> + <listitem><para>set_irq_chip()</para></listitem> + <listitem><para>set_irq_chip_data()</para></listitem> + </itemizedlist> + See the autogenerated function documentation for details. + </para> + </sect1> + <sect1 id="Highlevel_IRQ_flow_handlers"> + <title>Highlevel IRQ flow handlers</title> + <para> + The generic layer provides a set of pre-defined irq-flow methods: + <itemizedlist> + <listitem><para>handle_level_irq</para></listitem> + <listitem><para>handle_edge_irq</para></listitem> + <listitem><para>handle_simple_irq</para></listitem> + <listitem><para>handle_percpu_irq</para></listitem> + </itemizedlist> + The interrupt flow handlers (either predefined or architecture + specific) are assigned to specific interrupts by the architecture + either during bootup or during device initialization. + </para> + <sect2 id="Default_flow_implementations"> + <title>Default flow implementations</title> + <sect3 id="Helper_functions"> + <title>Helper functions</title> + <para> + The helper functions call the chip primitives and + are used by the default flow implementations. + The following helper functions are implemented (simplified excerpt): + <programlisting> +default_enable(irq) +{ + desc->chip->unmask(irq); +} + +default_disable(irq) +{ + if (!delay_disable(irq)) + desc->chip->mask(irq); +} + +default_ack(irq) +{ + chip->ack(irq); +} + +default_mask_ack(irq) +{ + if (chip->mask_ack) { + chip->mask_ack(irq); + } else { + chip->mask(irq); + chip->ack(irq); + } +} + +noop(irq) +{ +} + + </programlisting> + </para> + </sect3> + </sect2> + <sect2 id="Default_flow_handler_implementations"> + <title>Default flow handler implementations</title> + <sect3 id="Default_Level_IRQ_flow_handler"> + <title>Default Level IRQ flow handler</title> + <para> + handle_level_irq provides a generic implementation + for level-triggered interrupts. + </para> + <para> + The following control flow is implemented (simplified excerpt): + <programlisting> +desc->chip->start(); +handle_IRQ_event(desc->action); +desc->chip->end(); + </programlisting> + </para> + </sect3> + <sect3 id="Default_Edge_IRQ_flow_handler"> + <title>Default Edge IRQ flow handler</title> + <para> + handle_edge_irq provides a generic implementation + for edge-triggered interrupts. + </para> + <para> + The following control flow is implemented (simplified excerpt): + <programlisting> +if (desc->status & running) { + desc->chip->hold(); + desc->status |= pending | masked; + return; +} +desc->chip->start(); +desc->status |= running; +do { + if (desc->status & masked) + desc->chip->enable(); + desc->status &= ~pending; + handle_IRQ_event(desc->action); +} while (status & pending); +desc->status &= ~running; +desc->chip->end(); + </programlisting> + </para> + </sect3> + <sect3 id="Default_simple_IRQ_flow_handler"> + <title>Default simple IRQ flow handler</title> + <para> + handle_simple_irq provides a generic implementation + for simple interrupts. + </para> + <para> + Note: The simple flow handler does not call any + handler/chip primitives. + </para> + <para> + The following control flow is implemented (simplified excerpt): + <programlisting> +handle_IRQ_event(desc->action); + </programlisting> + </para> + </sect3> + <sect3 id="Default_per_CPU_flow_handler"> + <title>Default per CPU flow handler</title> + <para> + handle_percpu_irq provides a generic implementation + for per CPU interrupts. + </para> + <para> + Per CPU interrupts are only available on SMP and + the handler provides a simplified version without + locking. + </para> + <para> + The following control flow is implemented (simplified excerpt): + <programlisting> +desc->chip->start(); +handle_IRQ_event(desc->action); +desc->chip->end(); + </programlisting> + </para> + </sect3> + </sect2> + <sect2 id="Quirks_and_optimizations"> + <title>Quirks and optimizations</title> + <para> + The generic functions are intended for 'clean' architectures and chips, + which have no platform-specific IRQ handling quirks. If an architecture + needs to implement quirks on the 'flow' level then it can do so by + overriding the highlevel irq-flow handler. + </para> + </sect2> + <sect2 id="Delayed_interrupt_disable"> + <title>Delayed interrupt disable</title> + <para> + This per interrupt selectable feature, which was introduced by Russell + King in the ARM interrupt implementation, does not mask an interrupt + at the hardware level when disable_irq() is called. The interrupt is + kept enabled and is masked in the flow handler when an interrupt event + happens. This prevents losing edge interrupts on hardware which does + not store an edge interrupt event while the interrupt is disabled at + the hardware level. When an interrupt arrives while the IRQ_DISABLED + flag is set, then the interrupt is masked at the hardware level and + the IRQ_PENDING bit is set. When the interrupt is re-enabled by + enable_irq() the pending bit is checked and if it is set, the + interrupt is resent either via hardware or by a software resend + mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when + you want to use the delayed interrupt disable feature and your + hardware is not capable of retriggering an interrupt.) + The delayed interrupt disable can be runtime enabled, per interrupt, + by setting the IRQ_DELAYED_DISABLE flag in the irq_desc status field. + </para> + </sect2> + </sect1> + <sect1 id="Chiplevel_hardware_encapsulation"> + <title>Chiplevel hardware encapsulation</title> + <para> + The chip level hardware descriptor structure irq_chip + contains all the direct chip relevant functions, which + can be utilized by the irq flow implementations. + <itemizedlist> + <listitem><para>ack()</para></listitem> + <listitem><para>mask_ack() - Optional, recommended for performance</para></listitem> + <listitem><para>mask()</para></listitem> + <listitem><para>unmask()</para></listitem> + <listitem><para>retrigger() - Optional</para></listitem> + <listitem><para>set_type() - Optional</para></listitem> + <listitem><para>set_wake() - Optional</para></listitem> + </itemizedlist> + These primitives are strictly intended to mean what they say: ack means + ACK, masking means masking of an IRQ line, etc. It is up to the flow + handler(s) to use these basic units of lowlevel functionality. + </para> + </sect1> + </chapter> + + <chapter id="doirq"> + <title>__do_IRQ entry point</title> + <para> + The original implementation __do_IRQ() is an alternative entry + point for all types of interrupts. + </para> + <para> + This handler turned out to be not suitable for all + interrupt hardware and was therefore reimplemented with split + functionality for egde/level/simple/percpu interrupts. This is not + only a functional optimization. It also shortens code paths for + interrupts. + </para> + <para> + To make use of the split implementation, replace the call to + __do_IRQ by a call to desc->chip->handle_irq() and associate + the appropriate handler function to desc->chip->handle_irq(). + In most cases the generic handler implementations should + be sufficient. + </para> + </chapter> + + <chapter id="locking"> + <title>Locking on SMP</title> + <para> + The locking of chip registers is up to the architecture that + defines the chip primitives. There is a chip->lock field that can be used + for serialization, but the generic layer does not touch it. The per-irq + structure is protected via desc->lock, by the generic layer. + </para> + </chapter> + <chapter id="structs"> + <title>Structures</title> + <para> + This chapter contains the autogenerated documentation of the structures which are + used in the generic IRQ layer. + </para> +!Iinclude/linux/irq.h + </chapter> + + <chapter id="pubfunctions"> + <title>Public Functions Provided</title> + <para> + This chapter contains the autogenerated documentation of the kernel API functions + which are exported. + </para> +!Ekernel/irq/manage.c +!Ekernel/irq/chip.c + </chapter> + + <chapter id="intfunctions"> + <title>Internal Functions Provided</title> + <para> + This chapter contains the autogenerated documentation of the internal functions. + </para> +!Ikernel/irq/handle.c +!Ikernel/irq/chip.c + </chapter> + + <chapter id="credits"> + <title>Credits</title> + <para> + The following people have contributed to this document: + <orderedlist> + <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem> + <listitem><para>Ingo Molnar<email>mingo@elte.hu</email></para></listitem> + </orderedlist> + </para> + </chapter> +</book> diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl new file mode 100644 index 0000000..5818ff7 --- /dev/null +++ b/Documentation/DocBook/kernel-api.tmpl @@ -0,0 +1,707 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="LinuxKernelAPI"> + <bookinfo> + <title>The Linux Kernel API</title> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="Basics"> + <title>Driver Basics</title> + <sect1><title>Driver Entry and Exit points</title> +!Iinclude/linux/init.h + </sect1> + + <sect1><title>Atomic and pointer manipulation</title> +!Iarch/x86/include/asm/atomic_32.h +!Iarch/x86/include/asm/unaligned.h + </sect1> + + <sect1><title>Delaying, scheduling, and timer routines</title> +!Iinclude/linux/sched.h +!Ekernel/sched.c +!Ekernel/timer.c + </sect1> + <sect1><title>High-resolution timers</title> +!Iinclude/linux/ktime.h +!Iinclude/linux/hrtimer.h +!Ekernel/hrtimer.c + </sect1> + <sect1><title>Workqueues and Kevents</title> +!Ekernel/workqueue.c + </sect1> + <sect1><title>Internal Functions</title> +!Ikernel/exit.c +!Ikernel/signal.c +!Iinclude/linux/kthread.h +!Ekernel/kthread.c + </sect1> + + <sect1><title>Kernel objects manipulation</title> +<!-- +X!Iinclude/linux/kobject.h +--> +!Elib/kobject.c + </sect1> + + <sect1><title>Kernel utility functions</title> +!Iinclude/linux/kernel.h +!Ekernel/printk.c +!Ekernel/panic.c +!Ekernel/sys.c +!Ekernel/rcupdate.c + </sect1> + + <sect1><title>Device Resource Management</title> +!Edrivers/base/devres.c + </sect1> + + </chapter> + + <chapter id="adt"> + <title>Data Types</title> + <sect1><title>Doubly Linked Lists</title> +!Iinclude/linux/list.h + </sect1> + </chapter> + + <chapter id="libc"> + <title>Basic C Library Functions</title> + + <para> + When writing drivers, you cannot in general use routines which are + from the C Library. Some of the functions have been found generally + useful and they are listed below. The behaviour of these functions + may vary slightly from those defined by ANSI, and these deviations + are noted in the text. + </para> + + <sect1><title>String Conversions</title> +!Ilib/vsprintf.c +!Elib/vsprintf.c + </sect1> + <sect1><title>String Manipulation</title> +<!-- All functions are exported at now +X!Ilib/string.c + --> +!Elib/string.c + </sect1> + <sect1><title>Bit Operations</title> +!Iarch/x86/include/asm/bitops.h + </sect1> + </chapter> + + <chapter id="kernel-lib"> + <title>Basic Kernel Library Functions</title> + + <para> + The Linux kernel provides more basic utility functions. + </para> + + <sect1><title>Bitmap Operations</title> +!Elib/bitmap.c +!Ilib/bitmap.c + </sect1> + + <sect1><title>Command-line Parsing</title> +!Elib/cmdline.c + </sect1> + + <sect1 id="crc"><title>CRC Functions</title> +!Elib/crc7.c +!Elib/crc16.c +!Elib/crc-itu-t.c +!Elib/crc32.c +!Elib/crc-ccitt.c + </sect1> + </chapter> + + <chapter id="mm"> + <title>Memory Management in Linux</title> + <sect1><title>The Slab Cache</title> +!Iinclude/linux/slab.h +!Emm/slab.c + </sect1> + <sect1><title>User Space Memory Access</title> +!Iarch/x86/include/asm/uaccess_32.h +!Earch/x86/lib/usercopy_32.c + </sect1> + <sect1><title>More Memory Management Functions</title> +!Emm/readahead.c +!Emm/filemap.c +!Emm/memory.c +!Emm/vmalloc.c +!Imm/page_alloc.c +!Emm/mempool.c +!Emm/dmapool.c +!Emm/page-writeback.c +!Emm/truncate.c + </sect1> + </chapter> + + + <chapter id="ipc"> + <title>Kernel IPC facilities</title> + + <sect1><title>IPC utilities</title> +!Iipc/util.c + </sect1> + </chapter> + + <chapter id="kfifo"> + <title>FIFO Buffer</title> + <sect1><title>kfifo interface</title> +!Iinclude/linux/kfifo.h +!Ekernel/kfifo.c + </sect1> + </chapter> + + <chapter id="relayfs"> + <title>relay interface support</title> + + <para> + Relay interface support + is designed to provide an efficient mechanism for tools and + facilities to relay large amounts of data from kernel space to + user space. + </para> + + <sect1><title>relay interface</title> +!Ekernel/relay.c +!Ikernel/relay.c + </sect1> + </chapter> + + <chapter id="modload"> + <title>Module Support</title> + <sect1><title>Module Loading</title> +!Ekernel/kmod.c + </sect1> + <sect1><title>Inter Module support</title> + <para> + Refer to the file kernel/module.c for more information. + </para> +<!-- FIXME: Removed for now since no structured comments in source +X!Ekernel/module.c +--> + </sect1> + </chapter> + + <chapter id="hardware"> + <title>Hardware Interfaces</title> + <sect1><title>Interrupt Handling</title> +!Ekernel/irq/manage.c + </sect1> + + <sect1><title>DMA Channels</title> +!Ekernel/dma.c + </sect1> + + <sect1><title>Resources Management</title> +!Ikernel/resource.c +!Ekernel/resource.c + </sect1> + + <sect1><title>MTRR Handling</title> +!Earch/x86/kernel/cpu/mtrr/main.c + </sect1> + + <sect1><title>PCI Support Library</title> +!Edrivers/pci/pci.c +!Edrivers/pci/pci-driver.c +!Edrivers/pci/remove.c +!Edrivers/pci/pci-acpi.c +!Edrivers/pci/search.c +!Edrivers/pci/msi.c +!Edrivers/pci/bus.c +<!-- FIXME: Removed for now since no structured comments in source +X!Edrivers/pci/hotplug.c +--> +!Edrivers/pci/probe.c +!Edrivers/pci/rom.c + </sect1> + <sect1><title>PCI Hotplug Support Library</title> +!Edrivers/pci/hotplug/pci_hotplug_core.c + </sect1> + <sect1><title>MCA Architecture</title> + <sect2><title>MCA Device Functions</title> + <para> + Refer to the file arch/x86/kernel/mca_32.c for more information. + </para> +<!-- FIXME: Removed for now since no structured comments in source +X!Earch/x86/kernel/mca_32.c +--> + </sect2> + <sect2><title>MCA Bus DMA</title> +!Iarch/x86/include/asm/mca_dma.h + </sect2> + </sect1> + </chapter> + + <chapter id="firmware"> + <title>Firmware Interfaces</title> + <sect1><title>DMI Interfaces</title> +!Edrivers/firmware/dmi_scan.c + </sect1> + <sect1><title>EDD Interfaces</title> +!Idrivers/firmware/edd.c + </sect1> + </chapter> + + <chapter id="security"> + <title>Security Framework</title> +!Isecurity/security.c +!Esecurity/inode.c + </chapter> + + <chapter id="audit"> + <title>Audit Interfaces</title> +!Ekernel/audit.c +!Ikernel/auditsc.c +!Ikernel/auditfilter.c + </chapter> + + <chapter id="accounting"> + <title>Accounting Framework</title> +!Ikernel/acct.c + </chapter> + + <chapter id="devdrivers"> + <title>Device drivers infrastructure</title> + <sect1><title>Device Drivers Base</title> +<!-- +X!Iinclude/linux/device.h +--> +!Edrivers/base/driver.c +!Edrivers/base/core.c +!Edrivers/base/class.c +!Edrivers/base/firmware_class.c +!Edrivers/base/transport_class.c +<!-- Cannot be included, because + attribute_container_add_class_device_adapter + and attribute_container_classdev_to_container + exceed allowed 44 characters maximum +X!Edrivers/base/attribute_container.c +--> +!Edrivers/base/sys.c +<!-- +X!Edrivers/base/interface.c +--> +!Edrivers/base/platform.c +!Edrivers/base/bus.c + </sect1> + <sect1><title>Device Drivers Power Management</title> +!Edrivers/base/power/main.c + </sect1> + <sect1><title>Device Drivers ACPI Support</title> +<!-- Internal functions only +X!Edrivers/acpi/sleep/main.c +X!Edrivers/acpi/sleep/wakeup.c +X!Edrivers/acpi/motherboard.c +X!Edrivers/acpi/bus.c +--> +!Edrivers/acpi/scan.c +!Idrivers/acpi/scan.c +<!-- No correct structured comments +X!Edrivers/acpi/pci_bind.c +--> + </sect1> + <sect1><title>Device drivers PnP support</title> +!Idrivers/pnp/core.c +<!-- No correct structured comments +X!Edrivers/pnp/system.c + --> +!Edrivers/pnp/card.c +!Idrivers/pnp/driver.c +!Edrivers/pnp/manager.c +!Edrivers/pnp/support.c + </sect1> + <sect1><title>Userspace IO devices</title> +!Edrivers/uio/uio.c +!Iinclude/linux/uio_driver.h + </sect1> + </chapter> + + <chapter id="blkdev"> + <title>Block Devices</title> +!Eblock/blk-core.c +!Iblock/blk-core.c +!Eblock/blk-map.c +!Iblock/blk-sysfs.c +!Eblock/blk-settings.c +!Eblock/blk-exec.c +!Eblock/blk-barrier.c +!Eblock/blk-tag.c +!Iblock/blk-tag.c +!Eblock/blk-integrity.c +!Iblock/blktrace.c +!Iblock/genhd.c +!Eblock/genhd.c + </chapter> + + <chapter id="chrdev"> + <title>Char devices</title> +!Efs/char_dev.c + </chapter> + + <chapter id="miscdev"> + <title>Miscellaneous Devices</title> +!Edrivers/char/misc.c + </chapter> + + <chapter id="parportdev"> + <title>Parallel Port Devices</title> +!Iinclude/linux/parport.h +!Edrivers/parport/ieee1284.c +!Edrivers/parport/share.c +!Idrivers/parport/daisy.c + </chapter> + + <chapter id="message_devices"> + <title>Message-based devices</title> + <sect1><title>Fusion message devices</title> +!Edrivers/message/fusion/mptbase.c +!Idrivers/message/fusion/mptbase.c +!Edrivers/message/fusion/mptscsih.c +!Idrivers/message/fusion/mptscsih.c +!Idrivers/message/fusion/mptctl.c +!Idrivers/message/fusion/mptspi.c +!Idrivers/message/fusion/mptfc.c +!Idrivers/message/fusion/mptlan.c + </sect1> + <sect1><title>I2O message devices</title> +!Iinclude/linux/i2o.h +!Idrivers/message/i2o/core.h +!Edrivers/message/i2o/iop.c +!Idrivers/message/i2o/iop.c +!Idrivers/message/i2o/config-osm.c +!Edrivers/message/i2o/exec-osm.c +!Idrivers/message/i2o/exec-osm.c +!Idrivers/message/i2o/bus-osm.c +!Edrivers/message/i2o/device.c +!Idrivers/message/i2o/device.c +!Idrivers/message/i2o/driver.c +!Idrivers/message/i2o/pci.c +!Idrivers/message/i2o/i2o_block.c +!Idrivers/message/i2o/i2o_scsi.c +!Idrivers/message/i2o/i2o_proc.c + </sect1> + </chapter> + + <chapter id="snddev"> + <title>Sound Devices</title> +!Iinclude/sound/core.h +!Esound/sound_core.c +!Iinclude/sound/pcm.h +!Esound/core/pcm.c +!Esound/core/device.c +!Esound/core/info.c +!Esound/core/rawmidi.c +!Esound/core/sound.c +!Esound/core/memory.c +!Esound/core/pcm_memory.c +!Esound/core/init.c +!Esound/core/isadma.c +!Esound/core/control.c +!Esound/core/pcm_lib.c +!Esound/core/hwdep.c +!Esound/core/pcm_native.c +!Esound/core/memalloc.c +<!-- FIXME: Removed for now since no structured comments in source +X!Isound/sound_firmware.c +--> + </chapter> + + <chapter id="uart16x50"> + <title>16x50 UART Driver</title> +!Iinclude/linux/serial_core.h +!Edrivers/serial/serial_core.c +!Edrivers/serial/8250.c + </chapter> + + <chapter id="fbdev"> + <title>Frame Buffer Library</title> + + <para> + The frame buffer drivers depend heavily on four data structures. + These structures are declared in include/linux/fb.h. They are + fb_info, fb_var_screeninfo, fb_fix_screeninfo and fb_monospecs. + The last three can be made available to and from userland. + </para> + + <para> + fb_info defines the current state of a particular video card. + Inside fb_info, there exists a fb_ops structure which is a + collection of needed functions to make fbdev and fbcon work. + fb_info is only visible to the kernel. + </para> + + <para> + fb_var_screeninfo is used to describe the features of a video card + that are user defined. With fb_var_screeninfo, things such as + depth and the resolution may be defined. + </para> + + <para> + The next structure is fb_fix_screeninfo. This defines the + properties of a card that are created when a mode is set and can't + be changed otherwise. A good example of this is the start of the + frame buffer memory. This "locks" the address of the frame buffer + memory, so that it cannot be changed or moved. + </para> + + <para> + The last structure is fb_monospecs. In the old API, there was + little importance for fb_monospecs. This allowed for forbidden things + such as setting a mode of 800x600 on a fix frequency monitor. With + the new API, fb_monospecs prevents such things, and if used + correctly, can prevent a monitor from being cooked. fb_monospecs + will not be useful until kernels 2.5.x. + </para> + + <sect1><title>Frame Buffer Memory</title> +!Edrivers/video/fbmem.c + </sect1> +<!-- + <sect1><title>Frame Buffer Console</title> +X!Edrivers/video/console/fbcon.c + </sect1> +--> + <sect1><title>Frame Buffer Colormap</title> +!Edrivers/video/fbcmap.c + </sect1> +<!-- FIXME: + drivers/video/fbgen.c has no docs, which stuffs up the sgml. Comment + out until somebody adds docs. KAO + <sect1><title>Frame Buffer Generic Functions</title> +X!Idrivers/video/fbgen.c + </sect1> +KAO --> + <sect1><title>Frame Buffer Video Mode Database</title> +!Idrivers/video/modedb.c +!Edrivers/video/modedb.c + </sect1> + <sect1><title>Frame Buffer Macintosh Video Mode Database</title> +!Edrivers/video/macmodes.c + </sect1> + <sect1><title>Frame Buffer Fonts</title> + <para> + Refer to the file drivers/video/console/fonts.c for more information. + </para> +<!-- FIXME: Removed for now since no structured comments in source +X!Idrivers/video/console/fonts.c +--> + </sect1> + </chapter> + + <chapter id="input_subsystem"> + <title>Input Subsystem</title> +!Iinclude/linux/input.h +!Edrivers/input/input.c +!Edrivers/input/ff-core.c +!Edrivers/input/ff-memless.c + </chapter> + + <chapter id="spi"> + <title>Serial Peripheral Interface (SPI)</title> + <para> + SPI is the "Serial Peripheral Interface", widely used with + embedded systems because it is a simple and efficient + interface: basically a multiplexed shift register. + Its three signal wires hold a clock (SCK, often in the range + of 1-20 MHz), a "Master Out, Slave In" (MOSI) data line, and + a "Master In, Slave Out" (MISO) data line. + SPI is a full duplex protocol; for each bit shifted out the + MOSI line (one per clock) another is shifted in on the MISO line. + Those bits are assembled into words of various sizes on the + way to and from system memory. + An additional chipselect line is usually active-low (nCS); + four signals are normally used for each peripheral, plus + sometimes an interrupt. + </para> + <para> + The SPI bus facilities listed here provide a generalized + interface to declare SPI busses and devices, manage them + according to the standard Linux driver model, and perform + input/output operations. + At this time, only "master" side interfaces are supported, + where Linux talks to SPI peripherals and does not implement + such a peripheral itself. + (Interfaces to support implementing SPI slaves would + necessarily look different.) + </para> + <para> + The programming interface is structured around two kinds of driver, + and two kinds of device. + A "Controller Driver" abstracts the controller hardware, which may + be as simple as a set of GPIO pins or as complex as a pair of FIFOs + connected to dual DMA engines on the other side of the SPI shift + register (maximizing throughput). Such drivers bridge between + whatever bus they sit on (often the platform bus) and SPI, and + expose the SPI side of their device as a + <structname>struct spi_master</structname>. + SPI devices are children of that master, represented as a + <structname>struct spi_device</structname> and manufactured from + <structname>struct spi_board_info</structname> descriptors which + are usually provided by board-specific initialization code. + A <structname>struct spi_driver</structname> is called a + "Protocol Driver", and is bound to a spi_device using normal + driver model calls. + </para> + <para> + The I/O model is a set of queued messages. Protocol drivers + submit one or more <structname>struct spi_message</structname> + objects, which are processed and completed asynchronously. + (There are synchronous wrappers, however.) Messages are + built from one or more <structname>struct spi_transfer</structname> + objects, each of which wraps a full duplex SPI transfer. + A variety of protocol tweaking options are needed, because + different chips adopt very different policies for how they + use the bits transferred with SPI. + </para> +!Iinclude/linux/spi/spi.h +!Fdrivers/spi/spi.c spi_register_board_info +!Edrivers/spi/spi.c + </chapter> + + <chapter id="i2c"> + <title>I<superscript>2</superscript>C and SMBus Subsystem</title> + + <para> + I<superscript>2</superscript>C (or without fancy typography, "I2C") + is an acronym for the "Inter-IC" bus, a simple bus protocol which is + widely used where low data rate communications suffice. + Since it's also a licensed trademark, some vendors use another + name (such as "Two-Wire Interface", TWI) for the same bus. + I2C only needs two signals (SCL for clock, SDA for data), conserving + board real estate and minimizing signal quality issues. + Most I2C devices use seven bit addresses, and bus speeds of up + to 400 kHz; there's a high speed extension (3.4 MHz) that's not yet + found wide use. + I2C is a multi-master bus; open drain signaling is used to + arbitrate between masters, as well as to handshake and to + synchronize clocks from slower clients. + </para> + + <para> + The Linux I2C programming interfaces support only the master + side of bus interactions, not the slave side. + The programming interface is structured around two kinds of driver, + and two kinds of device. + An I2C "Adapter Driver" abstracts the controller hardware; it binds + to a physical device (perhaps a PCI device or platform_device) and + exposes a <structname>struct i2c_adapter</structname> representing + each I2C bus segment it manages. + On each I2C bus segment will be I2C devices represented by a + <structname>struct i2c_client</structname>. Those devices will + be bound to a <structname>struct i2c_driver</structname>, + which should follow the standard Linux driver model. + (At this writing, a legacy model is more widely used.) + There are functions to perform various I2C protocol operations; at + this writing all such functions are usable only from task context. + </para> + + <para> + The System Management Bus (SMBus) is a sibling protocol. Most SMBus + systems are also I2C conformant. The electrical constraints are + tighter for SMBus, and it standardizes particular protocol messages + and idioms. Controllers that support I2C can also support most + SMBus operations, but SMBus controllers don't support all the protocol + options that an I2C controller will. + There are functions to perform various SMBus protocol operations, + either using I2C primitives or by issuing SMBus commands to + i2c_adapter devices which don't support those I2C operations. + </para> + +!Iinclude/linux/i2c.h +!Fdrivers/i2c/i2c-boardinfo.c i2c_register_board_info +!Edrivers/i2c/i2c-core.c + </chapter> + + <chapter id="clk"> + <title>Clock Framework</title> + + <para> + The clock framework defines programming interfaces to support + software management of the system clock tree. + This framework is widely used with System-On-Chip (SOC) platforms + to support power management and various devices which may need + custom clock rates. + Note that these "clocks" don't relate to timekeeping or real + time clocks (RTCs), each of which have separate frameworks. + These <structname>struct clk</structname> instances may be used + to manage for example a 96 MHz signal that is used to shift bits + into and out of peripherals or busses, or otherwise trigger + synchronous state machine transitions in system hardware. + </para> + + <para> + Power management is supported by explicit software clock gating: + unused clocks are disabled, so the system doesn't waste power + changing the state of transistors that aren't in active use. + On some systems this may be backed by hardware clock gating, + where clocks are gated without being disabled in software. + Sections of chips that are powered but not clocked may be able + to retain their last state. + This low power state is often called a <emphasis>retention + mode</emphasis>. + This mode still incurs leakage currents, especially with finer + circuit geometries, but for CMOS circuits power is mostly used + by clocked state changes. + </para> + + <para> + Power-aware drivers only enable their clocks when the device + they manage is in active use. Also, system sleep states often + differ according to which clock domains are active: while a + "standby" state may allow wakeup from several active domains, a + "mem" (suspend-to-RAM) state may require a more wholesale shutdown + of clocks derived from higher speed PLLs and oscillators, limiting + the number of possible wakeup event sources. A driver's suspend + method may need to be aware of system-specific clock constraints + on the target sleep state. + </para> + + <para> + Some platforms support programmable clock generators. These + can be used by external chips of various kinds, such as other + CPUs, multimedia codecs, and devices with strict requirements + for interface clocking. + </para> + +!Iinclude/linux/clk.h + </chapter> + +</book> diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl new file mode 100644 index 0000000..a50d6cd --- /dev/null +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -0,0 +1,1327 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="lk-hacking-guide"> + <bookinfo> + <title>Unreliable Guide To Hacking The Linux Kernel</title> + + <authorgroup> + <author> + <firstname>Rusty</firstname> + <surname>Russell</surname> + <affiliation> + <address> + <email>rusty@rustcorp.com.au</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2005</year> + <holder>Rusty Russell</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + + <releaseinfo> + This is the first release of this document as part of the kernel tarball. + </releaseinfo> + + </bookinfo> + + <toc></toc> + + <chapter id="introduction"> + <title>Introduction</title> + <para> + Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux + Kernel Hacking. This document describes the common routines and + general requirements for kernel code: its goal is to serve as a + primer for Linux kernel development for experienced C + programmers. I avoid implementation details: that's what the + code is for, and I ignore whole tracts of useful routines. + </para> + <para> + Before you read this, please understand that I never wanted to + write this document, being grossly under-qualified, but I always + wanted to read it, and this was the only way. I hope it will + grow into a compendium of best practice, common starting points + and random information. + </para> + </chapter> + + <chapter id="basic-players"> + <title>The Players</title> + + <para> + At any time each of the CPUs in a system can be: + </para> + + <itemizedlist> + <listitem> + <para> + not associated with any process, serving a hardware interrupt; + </para> + </listitem> + + <listitem> + <para> + not associated with any process, serving a softirq or tasklet; + </para> + </listitem> + + <listitem> + <para> + running in kernel space, associated with a process (user context); + </para> + </listitem> + + <listitem> + <para> + running a process in user space. + </para> + </listitem> + </itemizedlist> + + <para> + There is an ordering between these. The bottom two can preempt + each other, but above that is a strict hierarchy: each can only be + preempted by the ones above it. For example, while a softirq is + running on a CPU, no other softirq will preempt it, but a hardware + interrupt can. However, any other CPUs in the system execute + independently. + </para> + + <para> + We'll see a number of ways that the user context can block + interrupts, to become truly non-preemptable. + </para> + + <sect1 id="basics-usercontext"> + <title>User Context</title> + + <para> + User context is when you are coming in from a system call or other + trap: like userspace, you can be preempted by more important tasks + and by interrupts. You can sleep, by calling + <function>schedule()</function>. + </para> + + <note> + <para> + You are always in user context on module load and unload, + and on operations on the block device layer. + </para> + </note> + + <para> + In user context, the <varname>current</varname> pointer (indicating + the task we are currently executing) is valid, and + <function>in_interrupt()</function> + (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false + </returnvalue>. + </para> + + <caution> + <para> + Beware that if you have preemption or softirqs disabled + (see below), <function>in_interrupt()</function> will return a + false positive. + </para> + </caution> + </sect1> + + <sect1 id="basics-hardirqs"> + <title>Hardware Interrupts (Hard IRQs)</title> + + <para> + Timer ticks, <hardware>network cards</hardware> and + <hardware>keyboard</hardware> are examples of real + hardware which produce interrupts at any time. The kernel runs + interrupt handlers, which services the hardware. The kernel + guarantees that this handler is never re-entered: if the same + interrupt arrives, it is queued (or dropped). Because it + disables interrupts, this handler has to be fast: frequently it + simply acknowledges the interrupt, marks a 'software interrupt' + for execution and exits. + </para> + + <para> + You can tell you are in a hardware interrupt, because + <function>in_irq()</function> returns <returnvalue>true</returnvalue>. + </para> + <caution> + <para> + Beware that this will return a false positive if interrupts are disabled + (see below). + </para> + </caution> + </sect1> + + <sect1 id="basics-softirqs"> + <title>Software Interrupt Context: Softirqs and Tasklets</title> + + <para> + Whenever a system call is about to return to userspace, or a + hardware interrupt handler exits, any 'software interrupts' + which are marked pending (usually by hardware interrupts) are + run (<filename>kernel/softirq.c</filename>). + </para> + + <para> + Much of the real interrupt handling work is done here. Early in + the transition to <acronym>SMP</acronym>, there were only 'bottom + halves' (BHs), which didn't take advantage of multiple CPUs. Shortly + after we switched from wind-up computers made of match-sticks and snot, + we abandoned this limitation and switched to 'softirqs'. + </para> + + <para> + <filename class="headerfile">include/linux/interrupt.h</filename> lists the + different softirqs. A very important softirq is the + timer softirq (<filename + class="headerfile">include/linux/timer.h</filename>): you can + register to have it call functions for you in a given length of + time. + </para> + + <para> + Softirqs are often a pain to deal with, since the same softirq + will run simultaneously on more than one CPU. For this reason, + tasklets (<filename + class="headerfile">include/linux/interrupt.h</filename>) are more + often used: they are dynamically-registrable (meaning you can have + as many as you want), and they also guarantee that any tasklet + will only run on one CPU at any time, although different tasklets + can run simultaneously. + </para> + <caution> + <para> + The name 'tasklet' is misleading: they have nothing to do with 'tasks', + and probably more to do with some bad vodka Alexey Kuznetsov had at the + time. + </para> + </caution> + + <para> + You can tell you are in a softirq (or tasklet) + using the <function>in_softirq()</function> macro + (<filename class="headerfile">include/linux/interrupt.h</filename>). + </para> + <caution> + <para> + Beware that this will return a false positive if a bh lock (see below) + is held. + </para> + </caution> + </sect1> + </chapter> + + <chapter id="basic-rules"> + <title>Some Basic Rules</title> + + <variablelist> + <varlistentry> + <term>No memory protection</term> + <listitem> + <para> + If you corrupt memory, whether in user context or + interrupt context, the whole machine will crash. Are you + sure you can't do what you want in userspace? + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>No floating point or <acronym>MMX</acronym></term> + <listitem> + <para> + The <acronym>FPU</acronym> context is not saved; even in user + context the <acronym>FPU</acronym> state probably won't + correspond with the current process: you would mess with some + user process' <acronym>FPU</acronym> state. If you really want + to do this, you would have to explicitly save/restore the full + <acronym>FPU</acronym> state (and avoid context switches). It + is generally a bad idea; use fixed point arithmetic first. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>A rigid stack limit</term> + <listitem> + <para> + Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's + about 14K on most 64-bit archs, and often shared with interrupts + so you can't use it all. Avoid deep recursion and huge local + arrays on the stack (allocate them dynamically instead). + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>The Linux kernel is portable</term> + <listitem> + <para> + Let's keep it that way. Your code should be 64-bit clean, + and endian-independent. You should also minimize CPU + specific stuff, e.g. inline assembly should be cleanly + encapsulated and minimized to ease porting. Generally it + should be restricted to the architecture-dependent part of + the kernel tree. + </para> + </listitem> + </varlistentry> + </variablelist> + </chapter> + + <chapter id="ioctls"> + <title>ioctls: Not writing a new system call</title> + + <para> + A system call generally looks like this + </para> + + <programlisting> +asmlinkage long sys_mycall(int arg) +{ + return 0; +} + </programlisting> + + <para> + First, in most cases you don't want to create a new system call. + You create a character device and implement an appropriate ioctl + for it. This is much more flexible than system calls, doesn't have + to be entered in every architecture's + <filename class="headerfile">include/asm/unistd.h</filename> and + <filename>arch/kernel/entry.S</filename> file, and is much more + likely to be accepted by Linus. + </para> + + <para> + If all your routine does is read or write some parameter, consider + implementing a <function>sysfs</function> interface instead. + </para> + + <para> + Inside the ioctl you're in user context to a process. When a + error occurs you return a negated errno (see + <filename class="headerfile">include/linux/errno.h</filename>), + otherwise you return <returnvalue>0</returnvalue>. + </para> + + <para> + After you slept you should check if a signal occurred: the + Unix/Linux way of handling signals is to temporarily exit the + system call with the <constant>-ERESTARTSYS</constant> error. The + system call entry code will switch back to user context, process + the signal handler and then your system call will be restarted + (unless the user disabled that). So you should be prepared to + process the restart, e.g. if you're in the middle of manipulating + some data structure. + </para> + + <programlisting> +if (signal_pending()) + return -ERESTARTSYS; + </programlisting> + + <para> + If you're doing longer computations: first think userspace. If you + <emphasis>really</emphasis> want to do it in kernel you should + regularly check if you need to give up the CPU (remember there is + cooperative multitasking per CPU). Idiom: + </para> + + <programlisting> +cond_resched(); /* Will sleep */ + </programlisting> + + <para> + A short note on interface design: the UNIX system call motto is + "Provide mechanism not policy". + </para> + </chapter> + + <chapter id="deadlock-recipes"> + <title>Recipes for Deadlock</title> + + <para> + You cannot call any routines which may sleep, unless: + </para> + <itemizedlist> + <listitem> + <para> + You are in user context. + </para> + </listitem> + + <listitem> + <para> + You do not own any spinlocks. + </para> + </listitem> + + <listitem> + <para> + You have interrupts enabled (actually, Andi Kleen says + that the scheduling code will enable them for you, but + that's probably not what you wanted). + </para> + </listitem> + </itemizedlist> + + <para> + Note that some functions may sleep implicitly: common ones are + the user space access functions (*_user) and memory allocation + functions without <symbol>GFP_ATOMIC</symbol>. + </para> + + <para> + You should always compile your kernel + <symbol>CONFIG_DEBUG_SPINLOCK_SLEEP</symbol> on, and it will warn + you if you break these rules. If you <emphasis>do</emphasis> break + the rules, you will eventually lock up your box. + </para> + + <para> + Really. + </para> + </chapter> + + <chapter id="common-routines"> + <title>Common Routines</title> + + <sect1 id="routines-printk"> + <title> + <function>printk()</function> + <filename class="headerfile">include/linux/kernel.h</filename> + </title> + + <para> + <function>printk()</function> feeds kernel messages to the + console, dmesg, and the syslog daemon. It is useful for debugging + and reporting errors, and can be used inside interrupt context, + but use with caution: a machine which has its console flooded with + printk messages is unusable. It uses a format string mostly + compatible with ANSI C printf, and C string concatenation to give + it a first "priority" argument: + </para> + + <programlisting> +printk(KERN_INFO "i = %u\n", i); + </programlisting> + + <para> + See <filename class="headerfile">include/linux/kernel.h</filename>; + for other KERN_ values; these are interpreted by syslog as the + level. Special case: for printing an IP address use + </para> + + <programlisting> +__u32 ipaddress; +printk(KERN_INFO "my ip: %d.%d.%d.%d\n", NIPQUAD(ipaddress)); + </programlisting> + + <para> + <function>printk()</function> internally uses a 1K buffer and does + not catch overruns. Make sure that will be enough. + </para> + + <note> + <para> + You will know when you are a real kernel hacker + when you start typoing printf as printk in your user programs :) + </para> + </note> + + <!--- From the Lions book reader department --> + + <note> + <para> + Another sidenote: the original Unix Version 6 sources had a + comment on top of its printf function: "Printf should not be + used for chit-chat". You should follow that advice. + </para> + </note> + </sect1> + + <sect1 id="routines-copy"> + <title> + <function>copy_[to/from]_user()</function> + / + <function>get_user()</function> + / + <function>put_user()</function> + <filename class="headerfile">include/asm/uaccess.h</filename> + </title> + + <para> + <emphasis>[SLEEPS]</emphasis> + </para> + + <para> + <function>put_user()</function> and <function>get_user()</function> + are used to get and put single values (such as an int, char, or + long) from and to userspace. A pointer into userspace should + never be simply dereferenced: data should be copied using these + routines. Both return <constant>-EFAULT</constant> or 0. + </para> + <para> + <function>copy_to_user()</function> and + <function>copy_from_user()</function> are more general: they copy + an arbitrary amount of data to and from userspace. + <caution> + <para> + Unlike <function>put_user()</function> and + <function>get_user()</function>, they return the amount of + uncopied data (ie. <returnvalue>0</returnvalue> still means + success). + </para> + </caution> + [Yes, this moronic interface makes me cringe. The flamewar comes up every year or so. --RR.] + </para> + <para> + The functions may sleep implicitly. This should never be called + outside user context (it makes no sense), with interrupts + disabled, or a spinlock held. + </para> + </sect1> + + <sect1 id="routines-kmalloc"> + <title><function>kmalloc()</function>/<function>kfree()</function> + <filename class="headerfile">include/linux/slab.h</filename></title> + + <para> + <emphasis>[MAY SLEEP: SEE BELOW]</emphasis> + </para> + + <para> + These routines are used to dynamically request pointer-aligned + chunks of memory, like malloc and free do in userspace, but + <function>kmalloc()</function> takes an extra flag word. + Important values: + </para> + + <variablelist> + <varlistentry> + <term> + <constant> + GFP_KERNEL + </constant> + </term> + <listitem> + <para> + May sleep and swap to free memory. Only allowed in user + context, but is the most reliable way to allocate memory. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> + <constant> + GFP_ATOMIC + </constant> + </term> + <listitem> + <para> + Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>, + but may be called from interrupt context. You should + <emphasis>really</emphasis> have a good out-of-memory + error-handling strategy. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> + <constant> + GFP_DMA + </constant> + </term> + <listitem> + <para> + Allocate ISA DMA lower than 16MB. If you don't know what that + is you don't need it. Very unreliable. + </para> + </listitem> + </varlistentry> + </variablelist> + + <para> + If you see a <errorname>sleeping function called from invalid + context</errorname> warning message, then maybe you called a + sleeping allocation function from interrupt context without + <constant>GFP_ATOMIC</constant>. You should really fix that. + Run, don't walk. + </para> + + <para> + If you are allocating at least <constant>PAGE_SIZE</constant> + (<filename class="headerfile">include/asm/page.h</filename>) bytes, + consider using <function>__get_free_pages()</function> + + (<filename class="headerfile">include/linux/mm.h</filename>). It + takes an order argument (0 for page sized, 1 for double page, 2 + for four pages etc.) and the same memory priority flag word as + above. + </para> + + <para> + If you are allocating more than a page worth of bytes you can use + <function>vmalloc()</function>. It'll allocate virtual memory in + the kernel map. This block is not contiguous in physical memory, + but the <acronym>MMU</acronym> makes it look like it is for you + (so it'll only look contiguous to the CPUs, not to external device + drivers). If you really need large physically contiguous memory + for some weird device, you have a problem: it is poorly supported + in Linux because after some time memory fragmentation in a running + kernel makes it hard. The best way is to allocate the block early + in the boot process via the <function>alloc_bootmem()</function> + routine. + </para> + + <para> + Before inventing your own cache of often-used objects consider + using a slab cache in + <filename class="headerfile">include/linux/slab.h</filename> + </para> + </sect1> + + <sect1 id="routines-current"> + <title><function>current</function> + <filename class="headerfile">include/asm/current.h</filename></title> + + <para> + This global variable (really a macro) contains a pointer to + the current task structure, so is only valid in user context. + For example, when a process makes a system call, this will + point to the task structure of the calling process. It is + <emphasis>not NULL</emphasis> in interrupt context. + </para> + </sect1> + + <sect1 id="routines-udelay"> + <title><function>mdelay()</function>/<function>udelay()</function> + <filename class="headerfile">include/asm/delay.h</filename> + <filename class="headerfile">include/linux/delay.h</filename> + </title> + + <para> + The <function>udelay()</function> and <function>ndelay()</function> functions can be used for small pauses. + Do not use large values with them as you risk + overflow - the helper function <function>mdelay()</function> is useful + here, or consider <function>msleep()</function>. + </para> + </sect1> + + <sect1 id="routines-endian"> + <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function> + <filename class="headerfile">include/asm/byteorder.h</filename> + </title> + + <para> + The <function>cpu_to_be32()</function> family (where the "32" can + be replaced by 64 or 16, and the "be" can be replaced by "le") are + the general way to do endian conversions in the kernel: they + return the converted value. All variations supply the reverse as + well: <function>be32_to_cpu()</function>, etc. + </para> + + <para> + There are two major variations of these functions: the pointer + variation, such as <function>cpu_to_be32p()</function>, which take + a pointer to the given type, and return the converted value. The + other variation is the "in-situ" family, such as + <function>cpu_to_be32s()</function>, which convert value referred + to by the pointer, and return void. + </para> + </sect1> + + <sect1 id="routines-local-irqs"> + <title><function>local_irq_save()</function>/<function>local_irq_restore()</function> + <filename class="headerfile">include/asm/system.h</filename> + </title> + + <para> + These routines disable hard interrupts on the local CPU, and + restore them. They are reentrant; saving the previous state in + their one <varname>unsigned long flags</varname> argument. If you + know that interrupts are enabled, you can simply use + <function>local_irq_disable()</function> and + <function>local_irq_enable()</function>. + </para> + </sect1> + + <sect1 id="routines-softirqs"> + <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function> + <filename class="headerfile">include/linux/interrupt.h</filename></title> + + <para> + These routines disable soft interrupts on the local CPU, and + restore them. They are reentrant; if soft interrupts were + disabled before, they will still be disabled after this pair + of functions has been called. They prevent softirqs and tasklets + from running on the current CPU. + </para> + </sect1> + + <sect1 id="routines-processorids"> + <title><function>smp_processor_id</function>() + <filename class="headerfile">include/asm/smp.h</filename></title> + + <para> + <function>get_cpu()</function> disables preemption (so you won't + suddenly get moved to another CPU) and returns the current + processor number, between 0 and <symbol>NR_CPUS</symbol>. Note + that the CPU numbers are not necessarily continuous. You return + it again with <function>put_cpu()</function> when you are done. + </para> + <para> + If you know you cannot be preempted by another task (ie. you are + in interrupt context, or have preemption disabled) you can use + smp_processor_id(). + </para> + </sect1> + + <sect1 id="routines-init"> + <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type> + <filename class="headerfile">include/linux/init.h</filename></title> + + <para> + After boot, the kernel frees up a special section; functions + marked with <type>__init</type> and data structures marked with + <type>__initdata</type> are dropped after boot is complete: similarly + modules discard this memory after initialization. <type>__exit</type> + is used to declare a function which is only required on exit: the + function will be dropped if this file is not compiled as a module. + See the header file for use. Note that it makes no sense for a function + marked with <type>__init</type> to be exported to modules with + <function>EXPORT_SYMBOL()</function> - this will break. + </para> + + </sect1> + + <sect1 id="routines-init-again"> + <title><function>__initcall()</function>/<function>module_init()</function> + <filename class="headerfile">include/linux/init.h</filename></title> + <para> + Many parts of the kernel are well served as a module + (dynamically-loadable parts of the kernel). Using the + <function>module_init()</function> and + <function>module_exit()</function> macros it is easy to write code + without #ifdefs which can operate both as a module or built into + the kernel. + </para> + + <para> + The <function>module_init()</function> macro defines which + function is to be called at module insertion time (if the file is + compiled as a module), or at boot time: if the file is not + compiled as a module the <function>module_init()</function> macro + becomes equivalent to <function>__initcall()</function>, which + through linker magic ensures that the function is called on boot. + </para> + + <para> + The function can return a negative error number to cause + module loading to fail (unfortunately, this has no effect if + the module is compiled into the kernel). This function is + called in user context with interrupts enabled, so it can sleep. + </para> + </sect1> + + <sect1 id="routines-moduleexit"> + <title> <function>module_exit()</function> + <filename class="headerfile">include/linux/init.h</filename> </title> + + <para> + This macro defines the function to be called at module removal + time (or never, in the case of the file compiled into the + kernel). It will only be called if the module usage count has + reached zero. This function can also sleep, but cannot fail: + everything must be cleaned up by the time it returns. + </para> + + <para> + Note that this macro is optional: if it is not present, your + module will not be removable (except for 'rmmod -f'). + </para> + </sect1> + + <sect1 id="routines-module-use-counters"> + <title> <function>try_module_get()</function>/<function>module_put()</function> + <filename class="headerfile">include/linux/module.h</filename></title> + + <para> + These manipulate the module usage count, to protect against + removal (a module also can't be removed if another module uses one + of its exported symbols: see below). Before calling into module + code, you should call <function>try_module_get()</function> on + that module: if it fails, then the module is being removed and you + should act as if it wasn't there. Otherwise, you can safely enter + the module, and call <function>module_put()</function> when you're + finished. + </para> + + <para> + Most registerable structures have an + <structfield>owner</structfield> field, such as in the + <structname>file_operations</structname> structure. Set this field + to the macro <symbol>THIS_MODULE</symbol>. + </para> + </sect1> + + <!-- add info on new-style module refcounting here --> + </chapter> + + <chapter id="queues"> + <title>Wait Queues + <filename class="headerfile">include/linux/wait.h</filename> + </title> + <para> + <emphasis>[SLEEPS]</emphasis> + </para> + + <para> + A wait queue is used to wait for someone to wake you up when a + certain condition is true. They must be used carefully to ensure + there is no race condition. You declare a + <type>wait_queue_head_t</type>, and then processes which want to + wait for that condition declare a <type>wait_queue_t</type> + referring to themselves, and place that in the queue. + </para> + + <sect1 id="queue-declaring"> + <title>Declaring</title> + + <para> + You declare a <type>wait_queue_head_t</type> using the + <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the + <function>init_waitqueue_head()</function> routine in your + initialization code. + </para> + </sect1> + + <sect1 id="queue-waitqueue"> + <title>Queuing</title> + + <para> + Placing yourself in the waitqueue is fairly complex, because you + must put yourself in the queue before checking the condition. + There is a macro to do this: + <function>wait_event_interruptible()</function> + + <filename class="headerfile">include/linux/wait.h</filename> The + first argument is the wait queue head, and the second is an + expression which is evaluated; the macro returns + <returnvalue>0</returnvalue> when this expression is true, or + <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received. + The <function>wait_event()</function> version ignores signals. + </para> + <para> + Do not use the <function>sleep_on()</function> function family - + it is very easy to accidentally introduce races; almost certainly + one of the <function>wait_event()</function> family will do, or a + loop around <function>schedule_timeout()</function>. If you choose + to loop around <function>schedule_timeout()</function> remember + you must set the task state (with + <function>set_current_state()</function>) on each iteration to avoid + busy-looping. + </para> + + </sect1> + + <sect1 id="queue-waking"> + <title>Waking Up Queued Tasks</title> + + <para> + Call <function>wake_up()</function> + + <filename class="headerfile">include/linux/wait.h</filename>;, + which will wake up every process in the queue. The exception is + if one has <constant>TASK_EXCLUSIVE</constant> set, in which case + the remainder of the queue will not be woken. There are other variants + of this basic function available in the same header. + </para> + </sect1> + </chapter> + + <chapter id="atomic-ops"> + <title>Atomic Operations</title> + + <para> + Certain operations are guaranteed atomic on all platforms. The + first class of operations work on <type>atomic_t</type> + + <filename class="headerfile">include/asm/atomic.h</filename>; this + contains a signed integer (at least 32 bits long), and you must use + these functions to manipulate or read atomic_t variables. + <function>atomic_read()</function> and + <function>atomic_set()</function> get and set the counter, + <function>atomic_add()</function>, + <function>atomic_sub()</function>, + <function>atomic_inc()</function>, + <function>atomic_dec()</function>, and + <function>atomic_dec_and_test()</function> (returns + <returnvalue>true</returnvalue> if it was decremented to zero). + </para> + + <para> + Yes. It returns <returnvalue>true</returnvalue> (i.e. != 0) if the + atomic variable is zero. + </para> + + <para> + Note that these functions are slower than normal arithmetic, and + so should not be used unnecessarily. + </para> + + <para> + The second class of atomic operations is atomic bit operations on an + <type>unsigned long</type>, defined in + + <filename class="headerfile">include/linux/bitops.h</filename>. These + operations generally take a pointer to the bit pattern, and a bit + number: 0 is the least significant bit. + <function>set_bit()</function>, <function>clear_bit()</function> + and <function>change_bit()</function> set, clear, and flip the + given bit. <function>test_and_set_bit()</function>, + <function>test_and_clear_bit()</function> and + <function>test_and_change_bit()</function> do the same thing, + except return true if the bit was previously set; these are + particularly useful for atomically setting flags. + </para> + + <para> + It is possible to call these operations with bit indices greater + than BITS_PER_LONG. The resulting behavior is strange on big-endian + platforms though so it is a good idea not to do this. + </para> + </chapter> + + <chapter id="symbols"> + <title>Symbols</title> + + <para> + Within the kernel proper, the normal linking rules apply + (ie. unless a symbol is declared to be file scope with the + <type>static</type> keyword, it can be used anywhere in the + kernel). However, for modules, a special exported symbol table is + kept which limits the entry points to the kernel proper. Modules + can also export symbols. + </para> + + <sect1 id="sym-exportsymbols"> + <title><function>EXPORT_SYMBOL()</function> + <filename class="headerfile">include/linux/module.h</filename></title> + + <para> + This is the classic method of exporting a symbol: dynamically + loaded modules will be able to use the symbol as normal. + </para> + </sect1> + + <sect1 id="sym-exportsymbols-gpl"> + <title><function>EXPORT_SYMBOL_GPL()</function> + <filename class="headerfile">include/linux/module.h</filename></title> + + <para> + Similar to <function>EXPORT_SYMBOL()</function> except that the + symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can + only be seen by modules with a + <function>MODULE_LICENSE()</function> that specifies a GPL + compatible license. It implies that the function is considered + an internal implementation issue, and not really an interface. + </para> + </sect1> + </chapter> + + <chapter id="conventions"> + <title>Routines and Conventions</title> + + <sect1 id="conventions-doublelinkedlist"> + <title>Double-linked lists + <filename class="headerfile">include/linux/list.h</filename></title> + + <para> + There used to be three sets of linked-list routines in the kernel + headers, but this one is the winner. If you don't have some + particular pressing need for a single list, it's a good choice. + </para> + + <para> + In particular, <function>list_for_each_entry</function> is useful. + </para> + </sect1> + + <sect1 id="convention-returns"> + <title>Return Conventions</title> + + <para> + For code called in user context, it's very common to defy C + convention, and return <returnvalue>0</returnvalue> for success, + and a negative error number + (eg. <returnvalue>-EFAULT</returnvalue>) for failure. This can be + unintuitive at first, but it's fairly widespread in the kernel. + </para> + + <para> + Using <function>ERR_PTR()</function> + + <filename class="headerfile">include/linux/err.h</filename>; to + encode a negative error number into a pointer, and + <function>IS_ERR()</function> and <function>PTR_ERR()</function> + to get it back out again: avoids a separate pointer parameter for + the error number. Icky, but in a good way. + </para> + </sect1> + + <sect1 id="conventions-borkedcompile"> + <title>Breaking Compilation</title> + + <para> + Linus and the other developers sometimes change function or + structure names in development kernels; this is not done just to + keep everyone on their toes: it reflects a fundamental change + (eg. can no longer be called with interrupts on, or does extra + checks, or doesn't do checks which were caught before). Usually + this is accompanied by a fairly complete note to the linux-kernel + mailing list; search the archive. Simply doing a global replace + on the file usually makes things <emphasis>worse</emphasis>. + </para> + </sect1> + + <sect1 id="conventions-initialising"> + <title>Initializing structure members</title> + + <para> + The preferred method of initializing structures is to use + designated initialisers, as defined by ISO C99, eg: + </para> + <programlisting> +static struct block_device_operations opt_fops = { + .open = opt_open, + .release = opt_release, + .ioctl = opt_ioctl, + .check_media_change = opt_media_change, +}; + </programlisting> + <para> + This makes it easy to grep for, and makes it clear which + structure fields are set. You should do this because it looks + cool. + </para> + </sect1> + + <sect1 id="conventions-gnu-extns"> + <title>GNU Extensions</title> + + <para> + GNU Extensions are explicitly allowed in the Linux kernel. + Note that some of the more complex ones are not very well + supported, due to lack of general use, but the following are + considered standard (see the GCC info page section "C + Extensions" for more details - Yes, really the info page, the + man page is only a short summary of the stuff in info). + </para> + <itemizedlist> + <listitem> + <para> + Inline functions + </para> + </listitem> + <listitem> + <para> + Statement expressions (ie. the ({ and }) constructs). + </para> + </listitem> + <listitem> + <para> + Declaring attributes of a function / variable / type + (__attribute__) + </para> + </listitem> + <listitem> + <para> + typeof + </para> + </listitem> + <listitem> + <para> + Zero length arrays + </para> + </listitem> + <listitem> + <para> + Macro varargs + </para> + </listitem> + <listitem> + <para> + Arithmetic on void pointers + </para> + </listitem> + <listitem> + <para> + Non-Constant initializers + </para> + </listitem> + <listitem> + <para> + Assembler Instructions (not outside arch/ and include/asm/) + </para> + </listitem> + <listitem> + <para> + Function names as strings (__func__). + </para> + </listitem> + <listitem> + <para> + __builtin_constant_p() + </para> + </listitem> + </itemizedlist> + + <para> + Be wary when using long long in the kernel, the code gcc generates for + it is horrible and worse: division and multiplication does not work + on i386 because the GCC runtime functions for it are missing from + the kernel environment. + </para> + + <!-- FIXME: add a note about ANSI aliasing cleanness --> + </sect1> + + <sect1 id="conventions-cplusplus"> + <title>C++</title> + + <para> + Using C++ in the kernel is usually a bad idea, because the + kernel does not provide the necessary runtime environment + and the include files are not tested for it. It is still + possible, but not recommended. If you really want to do + this, forget about exceptions at least. + </para> + </sect1> + + <sect1 id="conventions-ifdef"> + <title>#if</title> + + <para> + It is generally considered cleaner to use macros in header files + (or at the top of .c files) to abstract away functions rather than + using `#if' pre-processor statements throughout the source code. + </para> + </sect1> + </chapter> + + <chapter id="submitting"> + <title>Putting Your Stuff in the Kernel</title> + + <para> + In order to get your stuff into shape for official inclusion, or + even to make a neat patch, there's administrative work to be + done: + </para> + <itemizedlist> + <listitem> + <para> + Figure out whose pond you've been pissing in. Look at the top of + the source files, inside the <filename>MAINTAINERS</filename> + file, and last of all in the <filename>CREDITS</filename> file. + You should coordinate with this person to make sure you're not + duplicating effort, or trying something that's already been + rejected. + </para> + + <para> + Make sure you put your name and EMail address at the top of + any files you create or mangle significantly. This is the + first place people will look when they find a bug, or when + <emphasis>they</emphasis> want to make a change. + </para> + </listitem> + + <listitem> + <para> + Usually you want a configuration option for your kernel hack. + Edit <filename>Kconfig</filename> in the appropriate directory. + The Config language is simple to use by cut and paste, and there's + complete documentation in + <filename>Documentation/kbuild/kconfig-language.txt</filename>. + </para> + + <para> + You may well want to make your CONFIG option only visible if + <symbol>CONFIG_EXPERIMENTAL</symbol> is enabled: this serves as a + warning to users. There many other fancy things you can do: see + the various <filename>Kconfig</filename> files for ideas. + </para> + + <para> + In your description of the option, make sure you address both the + expert user and the user who knows nothing about your feature. Mention + incompatibilities and issues here. <emphasis> Definitely + </emphasis> end your description with <quote> if in doubt, say N + </quote> (or, occasionally, `Y'); this is for people who have no + idea what you are talking about. + </para> + </listitem> + + <listitem> + <para> + Edit the <filename>Makefile</filename>: the CONFIG variables are + exported here so you can usually just add a "obj-$(CONFIG_xxx) += + xxx.o" line. The syntax is documented in + <filename>Documentation/kbuild/makefiles.txt</filename>. + </para> + </listitem> + + <listitem> + <para> + Put yourself in <filename>CREDITS</filename> if you've done + something noteworthy, usually beyond a single file (your name + should be at the top of the source files anyway). + <filename>MAINTAINERS</filename> means you want to be consulted + when changes are made to a subsystem, and hear about bugs; it + implies a more-than-passing commitment to some part of the code. + </para> + </listitem> + + <listitem> + <para> + Finally, don't forget to read <filename>Documentation/SubmittingPatches</filename> + and possibly <filename>Documentation/SubmittingDrivers</filename>. + </para> + </listitem> + </itemizedlist> + </chapter> + + <chapter id="cantrips"> + <title>Kernel Cantrips</title> + + <para> + Some favorites from browsing the source. Feel free to add to this + list. + </para> + + <para> + <filename>arch/x86/include/asm/delay.h:</filename> + </para> + <programlisting> +#define ndelay(n) (__builtin_constant_p(n) ? \ + ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ + __ndelay(n)) + </programlisting> + + <para> + <filename>include/linux/fs.h</filename>: + </para> + <programlisting> +/* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. + */ + #define ERR_PTR(err) ((void *)((long)(err))) + #define PTR_ERR(ptr) ((long)(ptr)) + #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000)) +</programlisting> + + <para> + <filename>arch/x86/include/asm/uaccess_32.h:</filename> + </para> + + <programlisting> +#define copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + __constant_copy_to_user((to),(from),(n)) : \ + __generic_copy_to_user((to),(from),(n))) + </programlisting> + + <para> + <filename>arch/sparc/kernel/head.S:</filename> + </para> + + <programlisting> +/* + * Sun people can't spell worth damn. "compatability" indeed. + * At least we *know* we can't spell, and use a spell-checker. + */ + +/* Uh, actually Linus it is I who cannot spell. Too much murky + * Sparc assembly will do this to ya. + */ +C_LABEL(cputypvar): + .asciz "compatability" + +/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */ + .align 4 +C_LABEL(cputypvar_sun4m): + .asciz "compatible" + </programlisting> + + <para> + <filename>arch/sparc/lib/checksum.S:</filename> + </para> + + <programlisting> + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + </programlisting> + </chapter> + + <chapter id="credits"> + <title>Thanks</title> + + <para> + Thanks to Andi Kleen for the idea, answering my questions, fixing + my mistakes, filling content, etc. Philipp Rumpf for more spelling + and clarity fixes, and some excellent non-obvious points. Werner + Almesberger for giving me a great summary of + <function>disable_irq()</function>, and Jes Sorensen and Andrea + Arcangeli added caveats. Michael Elizabeth Chastain for checking + and adding to the Configure section. <!-- Rusty insisted on this + bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook. + </para> + </chapter> +</book> + diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl new file mode 100644 index 0000000..084f6ad --- /dev/null +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -0,0 +1,2143 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="LKLockingGuide"> + <bookinfo> + <title>Unreliable Guide To Locking</title> + + <authorgroup> + <author> + <firstname>Rusty</firstname> + <surname>Russell</surname> + <affiliation> + <address> + <email>rusty@rustcorp.com.au</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2003</year> + <holder>Rusty Russell</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + + <toc></toc> + <chapter id="intro"> + <title>Introduction</title> + <para> + Welcome, to Rusty's Remarkably Unreliable Guide to Kernel + Locking issues. This document describes the locking systems in + the Linux Kernel in 2.6. + </para> + <para> + With the wide availability of HyperThreading, and <firstterm + linkend="gloss-preemption">preemption </firstterm> in the Linux + Kernel, everyone hacking on the kernel needs to know the + fundamentals of concurrency and locking for + <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>. + </para> + </chapter> + + <chapter id="races"> + <title>The Problem With Concurrency</title> + <para> + (Skip this if you know what a Race Condition is). + </para> + <para> + In a normal program, you can increment a counter like so: + </para> + <programlisting> + very_important_count++; + </programlisting> + + <para> + This is what they would expect to happen: + </para> + + <table> + <title>Expected Results</title> + + <tgroup cols="2" align="left"> + + <thead> + <row> + <entry>Instance 1</entry> + <entry>Instance 2</entry> + </row> + </thead> + + <tbody> + <row> + <entry>read very_important_count (5)</entry> + <entry></entry> + </row> + <row> + <entry>add 1 (6)</entry> + <entry></entry> + </row> + <row> + <entry>write very_important_count (6)</entry> + <entry></entry> + </row> + <row> + <entry></entry> + <entry>read very_important_count (6)</entry> + </row> + <row> + <entry></entry> + <entry>add 1 (7)</entry> + </row> + <row> + <entry></entry> + <entry>write very_important_count (7)</entry> + </row> + </tbody> + + </tgroup> + </table> + + <para> + This is what might happen: + </para> + + <table> + <title>Possible Results</title> + + <tgroup cols="2" align="left"> + <thead> + <row> + <entry>Instance 1</entry> + <entry>Instance 2</entry> + </row> + </thead> + + <tbody> + <row> + <entry>read very_important_count (5)</entry> + <entry></entry> + </row> + <row> + <entry></entry> + <entry>read very_important_count (5)</entry> + </row> + <row> + <entry>add 1 (6)</entry> + <entry></entry> + </row> + <row> + <entry></entry> + <entry>add 1 (6)</entry> + </row> + <row> + <entry>write very_important_count (6)</entry> + <entry></entry> + </row> + <row> + <entry></entry> + <entry>write very_important_count (6)</entry> + </row> + </tbody> + </tgroup> + </table> + + <sect1 id="race-condition"> + <title>Race Conditions and Critical Regions</title> + <para> + This overlap, where the result depends on the + relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>. + The piece of code containing the concurrency issue is called a + <firstterm>critical region</firstterm>. And especially since Linux starting running + on SMP machines, they became one of the major issues in kernel + design and implementation. + </para> + <para> + Preemption can have the same effect, even if there is only one + CPU: by preempting one task during the critical region, we have + exactly the same race condition. In this case the thread which + preempts might run the critical region itself. + </para> + <para> + The solution is to recognize when these simultaneous accesses + occur, and use locks to make sure that only one instance can + enter the critical region at any time. There are many + friendly primitives in the Linux kernel to help you do this. + And then there are the unfriendly primitives, but I'll pretend + they don't exist. + </para> + </sect1> + </chapter> + + <chapter id="locks"> + <title>Locking in the Linux Kernel</title> + + <para> + If I could give you one piece of advice: never sleep with anyone + crazier than yourself. But if I had to give you advice on + locking: <emphasis>keep it simple</emphasis>. + </para> + + <para> + Be reluctant to introduce new locks. + </para> + + <para> + Strangely enough, this last one is the exact reverse of my advice when + you <emphasis>have</emphasis> slept with someone crazier than yourself. + And you should think about getting a big dog. + </para> + + <sect1 id="lock-intro"> + <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title> + + <para> + There are two main types of kernel locks. The fundamental type + is the spinlock + (<filename class="headerfile">include/asm/spinlock.h</filename>), + which is a very simple single-holder lock: if you can't get the + spinlock, you keep trying (spinning) until you can. Spinlocks are + very small and fast, and can be used anywhere. + </para> + <para> + The second type is a mutex + (<filename class="headerfile">include/linux/mutex.h</filename>): it + is like a spinlock, but you may block holding a mutex. + If you can't lock a mutex, your task will suspend itself, and be woken + up when the mutex is released. This means the CPU can do something + else while you are waiting. There are many cases when you simply + can't sleep (see <xref linkend="sleeping-things"/>), and so have to + use a spinlock instead. + </para> + <para> + Neither type of lock is recursive: see + <xref linkend="deadlock"/>. + </para> + </sect1> + + <sect1 id="uniprocessor"> + <title>Locks and Uniprocessor Kernels</title> + + <para> + For kernels compiled without <symbol>CONFIG_SMP</symbol>, and + without <symbol>CONFIG_PREEMPT</symbol> spinlocks do not exist at + all. This is an excellent design decision: when no-one else can + run at the same time, there is no reason to have a lock. + </para> + + <para> + If the kernel is compiled without <symbol>CONFIG_SMP</symbol>, + but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks + simply disable preemption, which is sufficient to prevent any + races. For most purposes, we can think of preemption as + equivalent to SMP, and not worry about it separately. + </para> + + <para> + You should always test your locking code with <symbol>CONFIG_SMP</symbol> + and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it + will still catch some kinds of locking bugs. + </para> + + <para> + Mutexes still exist, because they are required for + synchronization between <firstterm linkend="gloss-usercontext">user + contexts</firstterm>, as we will see below. + </para> + </sect1> + + <sect1 id="usercontextlocking"> + <title>Locking Only In User Context</title> + + <para> + If you have a data structure which is only ever accessed from + user context, then you can use a simple mutex + (<filename>include/linux/mutex.h</filename>) to protect it. This + is the most trivial case: you initialize the mutex. Then you can + call <function>mutex_lock_interruptible()</function> to grab the mutex, + and <function>mutex_unlock()</function> to release it. There is also a + <function>mutex_lock()</function>, which should be avoided, because it + will not return if a signal is received. + </para> + + <para> + Example: <filename>net/netfilter/nf_sockopt.c</filename> allows + registration of new <function>setsockopt()</function> and + <function>getsockopt()</function> calls, with + <function>nf_register_sockopt()</function>. Registration and + de-registration are only done on module load and unload (and boot + time, where there is no concurrency), and the list of registrations + is only consulted for an unknown <function>setsockopt()</function> + or <function>getsockopt()</function> system call. The + <varname>nf_sockopt_mutex</varname> is perfect to protect this, + especially since the setsockopt and getsockopt calls may well + sleep. + </para> + </sect1> + + <sect1 id="lock-user-bh"> + <title>Locking Between User Context and Softirqs</title> + + <para> + If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares + data with user context, you have two problems. Firstly, the current + user context can be interrupted by a softirq, and secondly, the + critical region could be entered from another CPU. This is where + <function>spin_lock_bh()</function> + (<filename class="headerfile">include/linux/spinlock.h</filename>) is + used. It disables softirqs on that CPU, then grabs the lock. + <function>spin_unlock_bh()</function> does the reverse. (The + '_bh' suffix is a historical reference to "Bottom Halves", the + old name for software interrupts. It should really be + called spin_lock_softirq()' in a perfect world). + </para> + + <para> + Note that you can also use <function>spin_lock_irq()</function> + or <function>spin_lock_irqsave()</function> here, which stop + hardware interrupts as well: see <xref linkend="hardirq-context"/>. + </para> + + <para> + This works perfectly for <firstterm linkend="gloss-up"><acronym>UP + </acronym></firstterm> as well: the spin lock vanishes, and this macro + simply becomes <function>local_bh_disable()</function> + (<filename class="headerfile">include/linux/interrupt.h</filename>), which + protects you from the softirq being run. + </para> + </sect1> + + <sect1 id="lock-user-tasklet"> + <title>Locking Between User Context and Tasklets</title> + + <para> + This is exactly the same as above, because <firstterm + linkend="gloss-tasklet">tasklets</firstterm> are actually run + from a softirq. + </para> + </sect1> + + <sect1 id="lock-user-timers"> + <title>Locking Between User Context and Timers</title> + + <para> + This, too, is exactly the same as above, because <firstterm + linkend="gloss-timers">timers</firstterm> are actually run from + a softirq. From a locking point of view, tasklets and timers + are identical. + </para> + </sect1> + + <sect1 id="lock-tasklets"> + <title>Locking Between Tasklets/Timers</title> + + <para> + Sometimes a tasklet or timer might want to share data with + another tasklet or timer. + </para> + + <sect2 id="lock-tasklets-same"> + <title>The Same Tasklet/Timer</title> + <para> + Since a tasklet is never run on two CPUs at once, you don't + need to worry about your tasklet being reentrant (running + twice at once), even on SMP. + </para> + </sect2> + + <sect2 id="lock-tasklets-different"> + <title>Different Tasklets/Timers</title> + <para> + If another tasklet/timer wants + to share data with your tasklet or timer , you will both need to use + <function>spin_lock()</function> and + <function>spin_unlock()</function> calls. + <function>spin_lock_bh()</function> is + unnecessary here, as you are already in a tasklet, and + none will be run on the same CPU. + </para> + </sect2> + </sect1> + + <sect1 id="lock-softirqs"> + <title>Locking Between Softirqs</title> + + <para> + Often a softirq might + want to share data with itself or a tasklet/timer. + </para> + + <sect2 id="lock-softirqs-same"> + <title>The Same Softirq</title> + + <para> + The same softirq can run on the other CPUs: you can use a + per-CPU array (see <xref linkend="per-cpu"/>) for better + performance. If you're going so far as to use a softirq, + you probably care about scalable performance enough + to justify the extra complexity. + </para> + + <para> + You'll need to use <function>spin_lock()</function> and + <function>spin_unlock()</function> for shared data. + </para> + </sect2> + + <sect2 id="lock-softirqs-different"> + <title>Different Softirqs</title> + + <para> + You'll need to use <function>spin_lock()</function> and + <function>spin_unlock()</function> for shared data, whether it + be a timer, tasklet, different softirq or the same or another + softirq: any of them could be running on a different CPU. + </para> + </sect2> + </sect1> + </chapter> + + <chapter id="hardirq-context"> + <title>Hard IRQ Context</title> + + <para> + Hardware interrupts usually communicate with a + tasklet or softirq. Frequently this involves putting work in a + queue, which the softirq will take out. + </para> + + <sect1 id="hardirq-softirq"> + <title>Locking Between Hard IRQ and Softirqs/Tasklets</title> + + <para> + If a hardware irq handler shares data with a softirq, you have + two concerns. Firstly, the softirq processing can be + interrupted by a hardware interrupt, and secondly, the + critical region could be entered by a hardware interrupt on + another CPU. This is where <function>spin_lock_irq()</function> is + used. It is defined to disable interrupts on that cpu, then grab + the lock. <function>spin_unlock_irq()</function> does the reverse. + </para> + + <para> + The irq handler does not to use + <function>spin_lock_irq()</function>, because the softirq cannot + run while the irq handler is running: it can use + <function>spin_lock()</function>, which is slightly faster. The + only exception would be if a different hardware irq handler uses + the same lock: <function>spin_lock_irq()</function> will stop + that from interrupting us. + </para> + + <para> + This works perfectly for UP as well: the spin lock vanishes, + and this macro simply becomes <function>local_irq_disable()</function> + (<filename class="headerfile">include/asm/smp.h</filename>), which + protects you from the softirq/tasklet/BH being run. + </para> + + <para> + <function>spin_lock_irqsave()</function> + (<filename>include/linux/spinlock.h</filename>) is a variant + which saves whether interrupts were on or off in a flags word, + which is passed to <function>spin_unlock_irqrestore()</function>. This + means that the same code can be used inside an hard irq handler (where + interrupts are already off) and in softirqs (where the irq + disabling is required). + </para> + + <para> + Note that softirqs (and hence tasklets and timers) are run on + return from hardware interrupts, so + <function>spin_lock_irq()</function> also stops these. In that + sense, <function>spin_lock_irqsave()</function> is the most + general and powerful locking function. + </para> + + </sect1> + <sect1 id="hardirq-hardirq"> + <title>Locking Between Two Hard IRQ Handlers</title> + <para> + It is rare to have to share data between two IRQ handlers, but + if you do, <function>spin_lock_irqsave()</function> should be + used: it is architecture-specific whether all interrupts are + disabled inside irq handlers themselves. + </para> + </sect1> + + </chapter> + + <chapter id="cheatsheet"> + <title>Cheat Sheet For Locking</title> + <para> + Pete Zaitcev gives the following summary: + </para> + <itemizedlist> + <listitem> + <para> + If you are in a process context (any syscall) and want to + lock other process out, use a mutex. You can take a mutex + and sleep (<function>copy_from_user*(</function> or + <function>kmalloc(x,GFP_KERNEL)</function>). + </para> + </listitem> + <listitem> + <para> + Otherwise (== data can be touched in an interrupt), use + <function>spin_lock_irqsave()</function> and + <function>spin_unlock_irqrestore()</function>. + </para> + </listitem> + <listitem> + <para> + Avoid holding spinlock for more than 5 lines of code and + across any function call (except accessors like + <function>readb</function>). + </para> + </listitem> + </itemizedlist> + + <sect1 id="minimum-lock-reqirements"> + <title>Table of Minimum Requirements</title> + + <para> The following table lists the <emphasis>minimum</emphasis> + locking requirements between various contexts. In some cases, + the same context can only be running on one CPU at a time, so + no locking is required for that context (eg. a particular + thread can only run on one CPU at a time, but if it needs + shares data with another thread, locking is required). + </para> + <para> + Remember the advice above: you can always use + <function>spin_lock_irqsave()</function>, which is a superset + of all other spinlock primitives. + </para> + + <table> +<title>Table of Locking Requirements</title> +<tgroup cols="11"> +<tbody> + +<row> +<entry></entry> +<entry>IRQ Handler A</entry> +<entry>IRQ Handler B</entry> +<entry>Softirq A</entry> +<entry>Softirq B</entry> +<entry>Tasklet A</entry> +<entry>Tasklet B</entry> +<entry>Timer A</entry> +<entry>Timer B</entry> +<entry>User Context A</entry> +<entry>User Context B</entry> +</row> + +<row> +<entry>IRQ Handler A</entry> +<entry>None</entry> +</row> + +<row> +<entry>IRQ Handler B</entry> +<entry>SLIS</entry> +<entry>None</entry> +</row> + +<row> +<entry>Softirq A</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SL</entry> +</row> + +<row> +<entry>Softirq B</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SL</entry> +<entry>SL</entry> +</row> + +<row> +<entry>Tasklet A</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>None</entry> +</row> + +<row> +<entry>Tasklet B</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>None</entry> +</row> + +<row> +<entry>Timer A</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>None</entry> +</row> + +<row> +<entry>Timer B</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>SL</entry> +<entry>None</entry> +</row> + +<row> +<entry>User Context A</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>None</entry> +</row> + +<row> +<entry>User Context B</entry> +<entry>SLI</entry> +<entry>SLI</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>SLBH</entry> +<entry>MLI</entry> +<entry>None</entry> +</row> + +</tbody> +</tgroup> +</table> + + <table> +<title>Legend for Locking Requirements Table</title> +<tgroup cols="2"> +<tbody> + +<row> +<entry>SLIS</entry> +<entry>spin_lock_irqsave</entry> +</row> +<row> +<entry>SLI</entry> +<entry>spin_lock_irq</entry> +</row> +<row> +<entry>SL</entry> +<entry>spin_lock</entry> +</row> +<row> +<entry>SLBH</entry> +<entry>spin_lock_bh</entry> +</row> +<row> +<entry>MLI</entry> +<entry>mutex_lock_interruptible</entry> +</row> + +</tbody> +</tgroup> +</table> + +</sect1> +</chapter> + +<chapter id="trylock-functions"> + <title>The trylock Functions</title> + <para> + There are functions that try to acquire a lock only once and immediately + return a value telling about success or failure to acquire the lock. + They can be used if you need no access to the data protected with the lock + when some other thread is holding the lock. You should acquire the lock + later if you then need access to the data protected with the lock. + </para> + + <para> + <function>spin_trylock()</function> does not spin but returns non-zero if + it acquires the spinlock on the first try or 0 if not. This function can + be used in all contexts like <function>spin_lock</function>: you must have + disabled the contexts that might interrupt you and acquire the spin lock. + </para> + + <para> + <function>mutex_trylock()</function> does not suspend your task + but returns non-zero if it could lock the mutex on the first try + or 0 if not. This function cannot be safely used in hardware or software + interrupt contexts despite not sleeping. + </para> +</chapter> + + <chapter id="Examples"> + <title>Common Examples</title> + <para> +Let's step through a simple example: a cache of number to name +mappings. The cache keeps a count of how often each of the objects is +used, and when it gets full, throws out the least used one. + + </para> + + <sect1 id="examples-usercontext"> + <title>All In User Context</title> + <para> +For our first example, we assume that all operations are in user +context (ie. from system calls), so we can sleep. This means we can +use a mutex to protect the cache and all the objects within +it. Here's the code: + </para> + + <programlisting> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/mutex.h> +#include <asm/errno.h> + +struct object +{ + struct list_head list; + int id; + char name[32]; + int popularity; +}; + +/* Protects the cache, cache_num, and the objects within it */ +static DEFINE_MUTEX(cache_lock); +static LIST_HEAD(cache); +static unsigned int cache_num = 0; +#define MAX_CACHE_SIZE 10 + +/* Must be holding cache_lock */ +static struct object *__cache_find(int id) +{ + struct object *i; + + list_for_each_entry(i, &cache, list) + if (i->id == id) { + i->popularity++; + return i; + } + return NULL; +} + +/* Must be holding cache_lock */ +static void __cache_delete(struct object *obj) +{ + BUG_ON(!obj); + list_del(&obj->list); + kfree(obj); + cache_num--; +} + +/* Must be holding cache_lock */ +static void __cache_add(struct object *obj) +{ + list_add(&obj->list, &cache); + if (++cache_num > MAX_CACHE_SIZE) { + struct object *i, *outcast = NULL; + list_for_each_entry(i, &cache, list) { + if (!outcast || i->popularity < outcast->popularity) + outcast = i; + } + __cache_delete(outcast); + } +} + +int cache_add(int id, const char *name) +{ + struct object *obj; + + if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) + return -ENOMEM; + + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; + + mutex_lock(&cache_lock); + __cache_add(obj); + mutex_unlock(&cache_lock); + return 0; +} + +void cache_delete(int id) +{ + mutex_lock(&cache_lock); + __cache_delete(__cache_find(id)); + mutex_unlock(&cache_lock); +} + +int cache_find(int id, char *name) +{ + struct object *obj; + int ret = -ENOENT; + + mutex_lock(&cache_lock); + obj = __cache_find(id); + if (obj) { + ret = 0; + strcpy(name, obj->name); + } + mutex_unlock(&cache_lock); + return ret; +} +</programlisting> + + <para> +Note that we always make sure we have the cache_lock when we add, +delete, or look up the cache: both the cache infrastructure itself and +the contents of the objects are protected by the lock. In this case +it's easy, since we copy the data for the user, and never let them +access the objects directly. + </para> + <para> +There is a slight (and common) optimization here: in +<function>cache_add</function> we set up the fields of the object +before grabbing the lock. This is safe, as no-one else can access it +until we put it in cache. + </para> + </sect1> + + <sect1 id="examples-interrupt"> + <title>Accessing From Interrupt Context</title> + <para> +Now consider the case where <function>cache_find</function> can be +called from interrupt context: either a hardware interrupt or a +softirq. An example would be a timer which deletes object from the +cache. + </para> + <para> +The change is shown below, in standard patch format: the +<symbol>-</symbol> are lines which are taken away, and the +<symbol>+</symbol> are lines which are added. + </para> +<programlisting> +--- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100 ++++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100 +@@ -12,7 +12,7 @@ + int popularity; + }; + +-static DEFINE_MUTEX(cache_lock); ++static DEFINE_SPINLOCK(cache_lock); + static LIST_HEAD(cache); + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 +@@ -55,6 +55,7 @@ + int cache_add(int id, const char *name) + { + struct object *obj; ++ unsigned long flags; + + if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) + return -ENOMEM; +@@ -63,30 +64,33 @@ + obj->id = id; + obj->popularity = 0; + +- mutex_lock(&cache_lock); ++ spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +- mutex_unlock(&cache_lock); ++ spin_unlock_irqrestore(&cache_lock, flags); + return 0; + } + + void cache_delete(int id) + { +- mutex_lock(&cache_lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&cache_lock, flags); + __cache_delete(__cache_find(id)); +- mutex_unlock(&cache_lock); ++ spin_unlock_irqrestore(&cache_lock, flags); + } + + int cache_find(int id, char *name) + { + struct object *obj; + int ret = -ENOENT; ++ unsigned long flags; + +- mutex_lock(&cache_lock); ++ spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); + if (obj) { + ret = 0; + strcpy(name, obj->name); + } +- mutex_unlock(&cache_lock); ++ spin_unlock_irqrestore(&cache_lock, flags); + return ret; + } +</programlisting> + + <para> +Note that the <function>spin_lock_irqsave</function> will turn off +interrupts if they are on, otherwise does nothing (if we are already +in an interrupt handler), hence these functions are safe to call from +any context. + </para> + <para> +Unfortunately, <function>cache_add</function> calls +<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol> +flag, which is only legal in user context. I have assumed that +<function>cache_add</function> is still only called in user context, +otherwise this should become a parameter to +<function>cache_add</function>. + </para> + </sect1> + <sect1 id="examples-refcnt"> + <title>Exposing Objects Outside This File</title> + <para> +If our objects contained more information, it might not be sufficient +to copy the information in and out: other parts of the code might want +to keep pointers to these objects, for example, rather than looking up +the id every time. This produces two problems. + </para> + <para> +The first problem is that we use the <symbol>cache_lock</symbol> to +protect objects: we'd need to make this non-static so the rest of the +code can use it. This makes locking trickier, as it is no longer all +in one place. + </para> + <para> +The second problem is the lifetime problem: if another structure keeps +a pointer to an object, it presumably expects that pointer to remain +valid. Unfortunately, this is only guaranteed while you hold the +lock, otherwise someone might call <function>cache_delete</function> +and even worse, add another object, re-using the same address. + </para> + <para> +As there is only one lock, you can't hold it forever: no-one else would +get any work done. + </para> + <para> +The solution to this problem is to use a reference count: everyone who +has a pointer to the object increases it when they first get the +object, and drops the reference count when they're finished with it. +Whoever drops it to zero knows it is unused, and can actually delete it. + </para> + <para> +Here is the code: + </para> + +<programlisting> +--- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100 ++++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100 +@@ -7,6 +7,7 @@ + struct object + { + struct list_head list; ++ unsigned int refcnt; + int id; + char name[32]; + int popularity; +@@ -17,6 +18,35 @@ + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + ++static void __object_put(struct object *obj) ++{ ++ if (--obj->refcnt == 0) ++ kfree(obj); ++} ++ ++static void __object_get(struct object *obj) ++{ ++ obj->refcnt++; ++} ++ ++void object_put(struct object *obj) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&cache_lock, flags); ++ __object_put(obj); ++ spin_unlock_irqrestore(&cache_lock, flags); ++} ++ ++void object_get(struct object *obj) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&cache_lock, flags); ++ __object_get(obj); ++ spin_unlock_irqrestore(&cache_lock, flags); ++} ++ + /* Must be holding cache_lock */ + static struct object *__cache_find(int id) + { +@@ -35,6 +65,7 @@ + { + BUG_ON(!obj); + list_del(&obj->list); ++ __object_put(obj); + cache_num--; + } + +@@ -63,6 +94,7 @@ + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; ++ obj->refcnt = 1; /* The cache holds a reference */ + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +@@ -79,18 +111,15 @@ + spin_unlock_irqrestore(&cache_lock, flags); + } + +-int cache_find(int id, char *name) ++struct object *cache_find(int id) + { + struct object *obj; +- int ret = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); +- if (obj) { +- ret = 0; +- strcpy(name, obj->name); +- } ++ if (obj) ++ __object_get(obj); + spin_unlock_irqrestore(&cache_lock, flags); +- return ret; ++ return obj; + } +</programlisting> + +<para> +We encapsulate the reference counting in the standard 'get' and 'put' +functions. Now we can return the object itself from +<function>cache_find</function> which has the advantage that the user +can now sleep holding the object (eg. to +<function>copy_to_user</function> to name to userspace). +</para> +<para> +The other point to note is that I said a reference should be held for +every pointer to the object: thus the reference count is 1 when first +inserted into the cache. In some versions the framework does not hold +a reference count, but they are more complicated. +</para> + + <sect2 id="examples-refcnt-atomic"> + <title>Using Atomic Operations For The Reference Count</title> +<para> +In practice, <type>atomic_t</type> would usually be used for +<structfield>refcnt</structfield>. There are a number of atomic +operations defined in + +<filename class="headerfile">include/asm/atomic.h</filename>: these are +guaranteed to be seen atomically from all CPUs in the system, so no +lock is required. In this case, it is simpler than using spinlocks, +although for anything non-trivial using spinlocks is clearer. The +<function>atomic_inc</function> and +<function>atomic_dec_and_test</function> are used instead of the +standard increment and decrement operators, and the lock is no longer +used to protect the reference count itself. +</para> + +<programlisting> +--- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100 ++++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100 +@@ -7,7 +7,7 @@ + struct object + { + struct list_head list; +- unsigned int refcnt; ++ atomic_t refcnt; + int id; + char name[32]; + int popularity; +@@ -18,33 +18,15 @@ + static unsigned int cache_num = 0; + #define MAX_CACHE_SIZE 10 + +-static void __object_put(struct object *obj) +-{ +- if (--obj->refcnt == 0) +- kfree(obj); +-} +- +-static void __object_get(struct object *obj) +-{ +- obj->refcnt++; +-} +- + void object_put(struct object *obj) + { +- unsigned long flags; +- +- spin_lock_irqsave(&cache_lock, flags); +- __object_put(obj); +- spin_unlock_irqrestore(&cache_lock, flags); ++ if (atomic_dec_and_test(&obj->refcnt)) ++ kfree(obj); + } + + void object_get(struct object *obj) + { +- unsigned long flags; +- +- spin_lock_irqsave(&cache_lock, flags); +- __object_get(obj); +- spin_unlock_irqrestore(&cache_lock, flags); ++ atomic_inc(&obj->refcnt); + } + + /* Must be holding cache_lock */ +@@ -65,7 +47,7 @@ + { + BUG_ON(!obj); + list_del(&obj->list); +- __object_put(obj); ++ object_put(obj); + cache_num--; + } + +@@ -94,7 +76,7 @@ + strlcpy(obj->name, name, sizeof(obj->name)); + obj->id = id; + obj->popularity = 0; +- obj->refcnt = 1; /* The cache holds a reference */ ++ atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +@@ -119,7 +101,7 @@ + spin_lock_irqsave(&cache_lock, flags); + obj = __cache_find(id); + if (obj) +- __object_get(obj); ++ object_get(obj); + spin_unlock_irqrestore(&cache_lock, flags); + return obj; + } +</programlisting> +</sect2> +</sect1> + + <sect1 id="examples-lock-per-obj"> + <title>Protecting The Objects Themselves</title> + <para> +In these examples, we assumed that the objects (except the reference +counts) never changed once they are created. If we wanted to allow +the name to change, there are three possibilities: + </para> + <itemizedlist> + <listitem> + <para> +You can make <symbol>cache_lock</symbol> non-static, and tell people +to grab that lock before changing the name in any object. + </para> + </listitem> + <listitem> + <para> +You can provide a <function>cache_obj_rename</function> which grabs +this lock and changes the name for the caller, and tell everyone to +use that function. + </para> + </listitem> + <listitem> + <para> +You can make the <symbol>cache_lock</symbol> protect only the cache +itself, and use another lock to protect the name. + </para> + </listitem> + </itemizedlist> + + <para> +Theoretically, you can make the locks as fine-grained as one lock for +every field, for every object. In practice, the most common variants +are: +</para> + <itemizedlist> + <listitem> + <para> +One lock which protects the infrastructure (the <symbol>cache</symbol> +list in this example) and all the objects. This is what we have done +so far. + </para> + </listitem> + <listitem> + <para> +One lock which protects the infrastructure (including the list +pointers inside the objects), and one lock inside the object which +protects the rest of that object. + </para> + </listitem> + <listitem> + <para> +Multiple locks to protect the infrastructure (eg. one lock per hash +chain), possibly with a separate per-object lock. + </para> + </listitem> + </itemizedlist> + +<para> +Here is the "lock-per-object" implementation: +</para> +<programlisting> +--- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100 ++++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 +@@ -6,11 +6,17 @@ + + struct object + { ++ /* These two protected by cache_lock. */ + struct list_head list; ++ int popularity; ++ + atomic_t refcnt; ++ ++ /* Doesn't change once created. */ + int id; ++ ++ spinlock_t lock; /* Protects the name */ + char name[32]; +- int popularity; + }; + + static DEFINE_SPINLOCK(cache_lock); +@@ -77,6 +84,7 @@ + obj->id = id; + obj->popularity = 0; + atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ ++ spin_lock_init(&obj->lock); + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +</programlisting> + +<para> +Note that I decide that the <structfield>popularity</structfield> +count should be protected by the <symbol>cache_lock</symbol> rather +than the per-object lock: this is because it (like the +<structname>struct list_head</structname> inside the object) is +logically part of the infrastructure. This way, I don't need to grab +the lock of every object in <function>__cache_add</function> when +seeking the least popular. +</para> + +<para> +I also decided that the <structfield>id</structfield> member is +unchangeable, so I don't need to grab each object lock in +<function>__cache_find()</function> to examine the +<structfield>id</structfield>: the object lock is only used by a +caller who wants to read or write the <structfield>name</structfield> +field. +</para> + +<para> +Note also that I added a comment describing what data was protected by +which locks. This is extremely important, as it describes the runtime +behavior of the code, and can be hard to gain from just reading. And +as Alan Cox says, <quote>Lock data, not code</quote>. +</para> +</sect1> +</chapter> + + <chapter id="common-problems"> + <title>Common Problems</title> + <sect1 id="deadlock"> + <title>Deadlock: Simple and Advanced</title> + + <para> + There is a coding bug where a piece of code tries to grab a + spinlock twice: it will spin forever, waiting for the lock to + be released (spinlocks, rwlocks and mutexes are not + recursive in Linux). This is trivial to diagnose: not a + stay-up-five-nights-talk-to-fluffy-code-bunnies kind of + problem. + </para> + + <para> + For a slightly more complex case, imagine you have a region + shared by a softirq and user context. If you use a + <function>spin_lock()</function> call to protect it, it is + possible that the user context will be interrupted by the softirq + while it holds the lock, and the softirq will then spin + forever trying to get the same lock. + </para> + + <para> + Both of these are called deadlock, and as shown above, it can + occur even with a single CPU (although not on UP compiles, + since spinlocks vanish on kernel compiles with + <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption + in the second example). + </para> + + <para> + This complete lockup is easy to diagnose: on SMP boxes the + watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set + (<filename>include/linux/spinlock.h</filename>) will show this up + immediately when it happens. + </para> + + <para> + A more complex problem is the so-called 'deadly embrace', + involving two or more locks. Say you have a hash table: each + entry in the table is a spinlock, and a chain of hashed + objects. Inside a softirq handler, you sometimes want to + alter an object from one place in the hash to another: you + grab the spinlock of the old hash chain and the spinlock of + the new hash chain, and delete the object from the old one, + and insert it in the new one. + </para> + + <para> + There are two problems here. First, if your code ever + tries to move the object to the same chain, it will deadlock + with itself as it tries to lock it twice. Secondly, if the + same softirq on another CPU is trying to move another object + in the reverse direction, the following could happen: + </para> + + <table> + <title>Consequences</title> + + <tgroup cols="2" align="left"> + + <thead> + <row> + <entry>CPU 1</entry> + <entry>CPU 2</entry> + </row> + </thead> + + <tbody> + <row> + <entry>Grab lock A -> OK</entry> + <entry>Grab lock B -> OK</entry> + </row> + <row> + <entry>Grab lock B -> spin</entry> + <entry>Grab lock A -> spin</entry> + </row> + </tbody> + </tgroup> + </table> + + <para> + The two CPUs will spin forever, waiting for the other to give up + their lock. It will look, smell, and feel like a crash. + </para> + </sect1> + + <sect1 id="techs-deadlock-prevent"> + <title>Preventing Deadlock</title> + + <para> + Textbooks will tell you that if you always lock in the same + order, you will never get this kind of deadlock. Practice + will tell you that this approach doesn't scale: when I + create a new lock, I don't understand enough of the kernel + to figure out where in the 5000 lock hierarchy it will fit. + </para> + + <para> + The best locks are encapsulated: they never get exposed in + headers, and are never held around calls to non-trivial + functions outside the same file. You can read through this + code and see that it will never deadlock, because it never + tries to grab another lock while it has that one. People + using your code don't even need to know you are using a + lock. + </para> + + <para> + A classic problem here is when you provide callbacks or + hooks: if you call these with the lock held, you risk simple + deadlock, or a deadly embrace (who knows what the callback + will do?). Remember, the other programmers are out to get + you, so don't do this. + </para> + + <sect2 id="techs-deadlock-overprevent"> + <title>Overzealous Prevention Of Deadlocks</title> + + <para> + Deadlocks are problematic, but not as bad as data + corruption. Code which grabs a read lock, searches a list, + fails to find what it wants, drops the read lock, grabs a + write lock and inserts the object has a race condition. + </para> + + <para> + If you don't see why, please stay the fuck away from my code. + </para> + </sect2> + </sect1> + + <sect1 id="racing-timers"> + <title>Racing Timers: A Kernel Pastime</title> + + <para> + Timers can produce their own special problems with races. + Consider a collection of objects (list, hash, etc) where each + object has a timer which is due to destroy it. + </para> + + <para> + If you want to destroy the entire collection (say on module + removal), you might do the following: + </para> + + <programlisting> + /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE + HUNGARIAN NOTATION */ + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + del_timer(&list->timer); + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + </programlisting> + + <para> + Sooner or later, this will crash on SMP, because a timer can + have just gone off before the <function>spin_lock_bh()</function>, + and it will only get the lock after we + <function>spin_unlock_bh()</function>, and then try to free + the element (which has already been freed!). + </para> + + <para> + This can be avoided by checking the result of + <function>del_timer()</function>: if it returns + <returnvalue>1</returnvalue>, the timer has been deleted. + If <returnvalue>0</returnvalue>, it means (in this + case) that it is currently running, so we can do: + </para> + + <programlisting> + retry: + spin_lock_bh(&list_lock); + + while (list) { + struct foo *next = list->next; + if (!del_timer(&list->timer)) { + /* Give timer a chance to delete this */ + spin_unlock_bh(&list_lock); + goto retry; + } + kfree(list); + list = next; + } + + spin_unlock_bh(&list_lock); + </programlisting> + + <para> + Another common problem is deleting timers which restart + themselves (by calling <function>add_timer()</function> at the end + of their timer function). Because this is a fairly common case + which is prone to races, you should use <function>del_timer_sync()</function> + (<filename class="headerfile">include/linux/timer.h</filename>) + to handle this case. It returns the number of times the timer + had to be deleted before we finally stopped it from adding itself back + in. + </para> + </sect1> + + </chapter> + + <chapter id="Efficiency"> + <title>Locking Speed</title> + + <para> +There are three main things to worry about when considering speed of +some code which does locking. First is concurrency: how many things +are going to be waiting while someone else is holding a lock. Second +is the time taken to actually acquire and release an uncontended lock. +Third is using fewer, or smarter locks. I'm assuming that the lock is +used fairly often: otherwise, you wouldn't be concerned about +efficiency. +</para> + <para> +Concurrency depends on how long the lock is usually held: you should +hold the lock for as long as needed, but no longer. In the cache +example, we always create the object without the lock held, and then +grab the lock only when we are ready to insert it in the list. +</para> + <para> +Acquisition times depend on how much damage the lock operations do to +the pipeline (pipeline stalls) and how likely it is that this CPU was +the last one to grab the lock (ie. is the lock cache-hot for this +CPU): on a machine with more CPUs, this likelihood drops fast. +Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns, +an atomic increment takes about 58ns, a lock which is cache-hot on +this CPU takes 160ns, and a cacheline transfer from another CPU takes +an additional 170 to 360ns. (These figures from Paul McKenney's +<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux +Journal RCU article</ulink>). +</para> + <para> +These two aims conflict: holding a lock for a short time might be done +by splitting locks into parts (such as in our final per-object-lock +example), but this increases the number of lock acquisitions, and the +results are often slower than having a single lock. This is another +reason to advocate locking simplicity. +</para> + <para> +The third concern is addressed below: there are some methods to reduce +the amount of locking which needs to be done. +</para> + + <sect1 id="efficiency-rwlocks"> + <title>Read/Write Lock Variants</title> + + <para> + Both spinlocks and mutexes have read/write variants: + <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>. + These divide users into two classes: the readers and the writers. If + you are only reading the data, you can get a read lock, but to write to + the data you need the write lock. Many people can hold a read lock, + but a writer must be sole holder. + </para> + + <para> + If your code divides neatly along reader/writer lines (as our + cache code does), and the lock is held by readers for + significant lengths of time, using these locks can help. They + are slightly slower than the normal locks though, so in practice + <type>rwlock_t</type> is not usually worthwhile. + </para> + </sect1> + + <sect1 id="efficiency-read-copy-update"> + <title>Avoiding Locks: Read Copy Update</title> + + <para> + There is a special method of read/write locking called Read Copy + Update. Using RCU, the readers can avoid taking a lock + altogether: as we expect our cache to be read more often than + updated (otherwise the cache is a waste of time), it is a + candidate for this optimization. + </para> + + <para> + How do we get rid of read locks? Getting rid of read locks + means that writers may be changing the list underneath the + readers. That is actually quite simple: we can read a linked + list while an element is being added if the writer adds the + element very carefully. For example, adding + <symbol>new</symbol> to a single linked list called + <symbol>list</symbol>: + </para> + + <programlisting> + new->next = list->next; + wmb(); + list->next = new; + </programlisting> + + <para> + The <function>wmb()</function> is a write memory barrier. It + ensures that the first operation (setting the new element's + <symbol>next</symbol> pointer) is complete and will be seen by + all CPUs, before the second operation is (putting the new + element into the list). This is important, since modern + compilers and modern CPUs can both reorder instructions unless + told otherwise: we want a reader to either not see the new + element at all, or see the new element with the + <symbol>next</symbol> pointer correctly pointing at the rest of + the list. + </para> + <para> + Fortunately, there is a function to do this for standard + <structname>struct list_head</structname> lists: + <function>list_add_rcu()</function> + (<filename>include/linux/list.h</filename>). + </para> + <para> + Removing an element from the list is even simpler: we replace + the pointer to the old element with a pointer to its successor, + and readers will either see it, or skip over it. + </para> + <programlisting> + list->next = old->next; + </programlisting> + <para> + There is <function>list_del_rcu()</function> + (<filename>include/linux/list.h</filename>) which does this (the + normal version poisons the old object, which we don't want). + </para> + <para> + The reader must also be careful: some CPUs can look through the + <symbol>next</symbol> pointer to start reading the contents of + the next element early, but don't realize that the pre-fetched + contents is wrong when the <symbol>next</symbol> pointer changes + underneath them. Once again, there is a + <function>list_for_each_entry_rcu()</function> + (<filename>include/linux/list.h</filename>) to help you. Of + course, writers can just use + <function>list_for_each_entry()</function>, since there cannot + be two simultaneous writers. + </para> + <para> + Our final dilemma is this: when can we actually destroy the + removed element? Remember, a reader might be stepping through + this element in the list right now: if we free this element and + the <symbol>next</symbol> pointer changes, the reader will jump + off into garbage and crash. We need to wait until we know that + all the readers who were traversing the list when we deleted the + element are finished. We use <function>call_rcu()</function> to + register a callback which will actually destroy the object once + the readers are finished. + </para> + <para> + But how does Read Copy Update know when the readers are + finished? The method is this: firstly, the readers always + traverse the list inside + <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function> + pairs: these simply disable preemption so the reader won't go to + sleep while reading the list. + </para> + <para> + RCU then waits until every other CPU has slept at least once: + since readers cannot sleep, we know that any readers which were + traversing the list during the deletion are finished, and the + callback is triggered. The real Read Copy Update code is a + little more optimized than this, but this is the fundamental + idea. + </para> + +<programlisting> +--- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100 ++++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100 +@@ -1,15 +1,18 @@ + #include <linux/list.h> + #include <linux/slab.h> + #include <linux/string.h> ++#include <linux/rcupdate.h> + #include <linux/mutex.h> + #include <asm/errno.h> + + struct object + { +- /* These two protected by cache_lock. */ ++ /* This is protected by RCU */ + struct list_head list; + int popularity; + ++ struct rcu_head rcu; ++ + atomic_t refcnt; + + /* Doesn't change once created. */ +@@ -40,7 +43,7 @@ + { + struct object *i; + +- list_for_each_entry(i, &cache, list) { ++ list_for_each_entry_rcu(i, &cache, list) { + if (i->id == id) { + i->popularity++; + return i; +@@ -49,19 +52,25 @@ + return NULL; + } + ++/* Final discard done once we know no readers are looking. */ ++static void cache_delete_rcu(void *arg) ++{ ++ object_put(arg); ++} ++ + /* Must be holding cache_lock */ + static void __cache_delete(struct object *obj) + { + BUG_ON(!obj); +- list_del(&obj->list); +- object_put(obj); ++ list_del_rcu(&obj->list); + cache_num--; ++ call_rcu(&obj->rcu, cache_delete_rcu, obj); + } + + /* Must be holding cache_lock */ + static void __cache_add(struct object *obj) + { +- list_add(&obj->list, &cache); ++ list_add_rcu(&obj->list, &cache); + if (++cache_num > MAX_CACHE_SIZE) { + struct object *i, *outcast = NULL; + list_for_each_entry(i, &cache, list) { +@@ -85,6 +94,7 @@ + obj->popularity = 0; + atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ + spin_lock_init(&obj->lock); ++ INIT_RCU_HEAD(&obj->rcu); + + spin_lock_irqsave(&cache_lock, flags); + __cache_add(obj); +@@ -104,12 +114,11 @@ + struct object *cache_find(int id) + { + struct object *obj; +- unsigned long flags; + +- spin_lock_irqsave(&cache_lock, flags); ++ rcu_read_lock(); + obj = __cache_find(id); + if (obj) + object_get(obj); +- spin_unlock_irqrestore(&cache_lock, flags); ++ rcu_read_unlock(); + return obj; + } +</programlisting> + +<para> +Note that the reader will alter the +<structfield>popularity</structfield> member in +<function>__cache_find()</function>, and now it doesn't hold a lock. +One solution would be to make it an <type>atomic_t</type>, but for +this usage, we don't really care about races: an approximate result is +good enough, so I didn't change it. +</para> + +<para> +The result is that <function>cache_find()</function> requires no +synchronization with any other functions, so is almost as fast on SMP +as it would be on UP. +</para> + +<para> +There is a furthur optimization possible here: remember our original +cache code, where there were no reference counts and the caller simply +held the lock whenever using the object? This is still possible: if +you hold the lock, noone can delete the object, so you don't need to +get and put the reference count. +</para> + +<para> +Now, because the 'read lock' in RCU is simply disabling preemption, a +caller which always has preemption disabled between calling +<function>cache_find()</function> and +<function>object_put()</function> does not need to actually get and +put the reference count: we could expose +<function>__cache_find()</function> by making it non-static, and +such callers could simply call that. +</para> +<para> +The benefit here is that the reference count is not written to: the +object is not altered in any way, which is much faster on SMP +machines due to caching. +</para> + </sect1> + + <sect1 id="per-cpu"> + <title>Per-CPU Data</title> + + <para> + Another technique for avoiding locking which is used fairly + widely is to duplicate information for each CPU. For example, + if you wanted to keep a count of a common condition, you could + use a spin lock and a single counter. Nice and simple. + </para> + + <para> + If that was too slow (it's usually not, but if you've got a + really big machine to test on and can show that it is), you + could instead use a counter for each CPU, then none of them need + an exclusive lock. See <function>DEFINE_PER_CPU()</function>, + <function>get_cpu_var()</function> and + <function>put_cpu_var()</function> + (<filename class="headerfile">include/linux/percpu.h</filename>). + </para> + + <para> + Of particular use for simple per-cpu counters is the + <type>local_t</type> type, and the + <function>cpu_local_inc()</function> and related functions, + which are more efficient than simple code on some architectures + (<filename class="headerfile">include/asm/local.h</filename>). + </para> + + <para> + Note that there is no simple, reliable way of getting an exact + value of such a counter, without introducing more locks. This + is not a problem for some uses. + </para> + </sect1> + + <sect1 id="mostly-hardirq"> + <title>Data Which Mostly Used By An IRQ Handler</title> + + <para> + If data is always accessed from within the same IRQ handler, you + don't need a lock at all: the kernel already guarantees that the + irq handler will not run simultaneously on multiple CPUs. + </para> + <para> + Manfred Spraul points out that you can still do this, even if + the data is very occasionally accessed in user context or + softirqs/tasklets. The irq handler doesn't use a lock, and + all other accesses are done as so: + </para> + +<programlisting> + spin_lock(&lock); + disable_irq(irq); + ... + enable_irq(irq); + spin_unlock(&lock); +</programlisting> + <para> + The <function>disable_irq()</function> prevents the irq handler + from running (and waits for it to finish if it's currently + running on other CPUs). The spinlock prevents any other + accesses happening at the same time. Naturally, this is slower + than just a <function>spin_lock_irq()</function> call, so it + only makes sense if this type of access happens extremely + rarely. + </para> + </sect1> + </chapter> + + <chapter id="sleeping-things"> + <title>What Functions Are Safe To Call From Interrupts?</title> + + <para> + Many functions in the kernel sleep (ie. call schedule()) + directly or indirectly: you can never call them while holding a + spinlock, or with preemption disabled. This also means you need + to be in user context: calling them from an interrupt is illegal. + </para> + + <sect1 id="sleeping"> + <title>Some Functions Which Sleep</title> + + <para> + The most common ones are listed below, but you usually have to + read the code to find out if other calls are safe. If everyone + else who calls it can sleep, you probably need to be able to + sleep, too. In particular, registration and deregistration + functions usually expect to be called from user context, and can + sleep. + </para> + + <itemizedlist> + <listitem> + <para> + Accesses to + <firstterm linkend="gloss-userspace">userspace</firstterm>: + </para> + <itemizedlist> + <listitem> + <para> + <function>copy_from_user()</function> + </para> + </listitem> + <listitem> + <para> + <function>copy_to_user()</function> + </para> + </listitem> + <listitem> + <para> + <function>get_user()</function> + </para> + </listitem> + <listitem> + <para> + <function>put_user()</function> + </para> + </listitem> + </itemizedlist> + </listitem> + + <listitem> + <para> + <function>kmalloc(GFP_KERNEL)</function> + </para> + </listitem> + + <listitem> + <para> + <function>mutex_lock_interruptible()</function> and + <function>mutex_lock()</function> + </para> + <para> + There is a <function>mutex_trylock()</function> which can be + used inside interrupt context, as it will not sleep. + <function>mutex_unlock()</function> will also never sleep. + </para> + </listitem> + </itemizedlist> + </sect1> + + <sect1 id="dont-sleep"> + <title>Some Functions Which Don't Sleep</title> + + <para> + Some functions are safe to call from any context, or holding + almost any lock. + </para> + + <itemizedlist> + <listitem> + <para> + <function>printk()</function> + </para> + </listitem> + <listitem> + <para> + <function>kfree()</function> + </para> + </listitem> + <listitem> + <para> + <function>add_timer()</function> and <function>del_timer()</function> + </para> + </listitem> + </itemizedlist> + </sect1> + </chapter> + + <chapter id="references"> + <title>Further reading</title> + + <itemizedlist> + <listitem> + <para> + <filename>Documentation/spinlocks.txt</filename>: + Linus Torvalds' spinlocking tutorial in the kernel sources. + </para> + </listitem> + + <listitem> + <para> + Unix Systems for Modern Architectures: Symmetric + Multiprocessing and Caching for Kernel Programmers: + </para> + + <para> + Curt Schimmel's very good introduction to kernel level + locking (not written for Linux, but nearly everything + applies). The book is expensive, but really worth every + penny to understand SMP locking. [ISBN: 0201633388] + </para> + </listitem> + </itemizedlist> + </chapter> + + <chapter id="thanks"> + <title>Thanks</title> + + <para> + Thanks to Telsa Gwynne for DocBooking, neatening and adding + style. + </para> + + <para> + Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul + Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim + Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney, + John Ashby for proofreading, correcting, flaming, commenting. + </para> + + <para> + Thanks to the cabal for having no influence on this document. + </para> + </chapter> + + <glossary id="glossary"> + <title>Glossary</title> + + <glossentry id="gloss-preemption"> + <glossterm>preemption</glossterm> + <glossdef> + <para> + Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is + unset, processes in user context inside the kernel would not + preempt each other (ie. you had that CPU until you gave it up, + except for interrupts). With the addition of + <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when + in user context, higher priority tasks can "cut in": spinlocks + were changed to disable preemption, even on UP. + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-bh"> + <glossterm>bh</glossterm> + <glossdef> + <para> + Bottom Half: for historical reasons, functions with + '_bh' in them often now refer to any software interrupt, e.g. + <function>spin_lock_bh()</function> blocks any software interrupt + on the current CPU. Bottom halves are deprecated, and will + eventually be replaced by tasklets. Only one bottom half will be + running at any time. + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-hwinterrupt"> + <glossterm>Hardware Interrupt / Hardware IRQ</glossterm> + <glossdef> + <para> + Hardware interrupt request. <function>in_irq()</function> returns + <returnvalue>true</returnvalue> in a hardware interrupt handler. + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-interruptcontext"> + <glossterm>Interrupt Context</glossterm> + <glossdef> + <para> + Not user context: processing a hardware irq or software irq. + Indicated by the <function>in_interrupt()</function> macro + returning <returnvalue>true</returnvalue>. + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-smp"> + <glossterm><acronym>SMP</acronym></glossterm> + <glossdef> + <para> + Symmetric Multi-Processor: kernels compiled for multiple-CPU + machines. (CONFIG_SMP=y). + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-softirq"> + <glossterm>Software Interrupt / softirq</glossterm> + <glossdef> + <para> + Software interrupt handler. <function>in_irq()</function> returns + <returnvalue>false</returnvalue>; <function>in_softirq()</function> + returns <returnvalue>true</returnvalue>. Tasklets and softirqs + both fall into the category of 'software interrupts'. + </para> + <para> + Strictly speaking a softirq is one of up to 32 enumerated software + interrupts which can run on multiple CPUs at once. + Sometimes used to refer to tasklets as + well (ie. all software interrupts). + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-tasklet"> + <glossterm>tasklet</glossterm> + <glossdef> + <para> + A dynamically-registrable software interrupt, + which is guaranteed to only run on one CPU at a time. + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-timers"> + <glossterm>timer</glossterm> + <glossdef> + <para> + A dynamically-registrable software interrupt, which is run at + (or close to) a given time. When running, it is just like a + tasklet (in fact, they are called from the TIMER_SOFTIRQ). + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-up"> + <glossterm><acronym>UP</acronym></glossterm> + <glossdef> + <para> + Uni-Processor: Non-SMP. (CONFIG_SMP=n). + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-usercontext"> + <glossterm>User Context</glossterm> + <glossdef> + <para> + The kernel executing on behalf of a particular process (ie. a + system call or trap) or kernel thread. You can tell which + process with the <symbol>current</symbol> macro.) Not to + be confused with userspace. Can be interrupted by software or + hardware interrupts. + </para> + </glossdef> + </glossentry> + + <glossentry id="gloss-userspace"> + <glossterm>Userspace</glossterm> + <glossdef> + <para> + A process executing its own code outside the kernel. + </para> + </glossdef> + </glossentry> + + </glossary> +</book> + diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl new file mode 100644 index 0000000..372dec2 --- /dev/null +++ b/Documentation/DocBook/kgdb.tmpl @@ -0,0 +1,459 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="kgdbOnLinux"> + <bookinfo> + <title>Using kgdb and the kgdb Internals</title> + + <authorgroup> + <author> + <firstname>Jason</firstname> + <surname>Wessel</surname> + <affiliation> + <address> + <email>jason.wessel@windriver.com</email> + </address> + </affiliation> + </author> + </authorgroup> + + <authorgroup> + <author> + <firstname>Tom</firstname> + <surname>Rini</surname> + <affiliation> + <address> + <email>trini@kernel.crashing.org</email> + </address> + </affiliation> + </author> + </authorgroup> + + <authorgroup> + <author> + <firstname>Amit S.</firstname> + <surname>Kale</surname> + <affiliation> + <address> + <email>amitkale@linsyssoft.com</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2008</year> + <holder>Wind River Systems, Inc.</holder> + </copyright> + <copyright> + <year>2004-2005</year> + <holder>MontaVista Software, Inc.</holder> + </copyright> + <copyright> + <year>2004</year> + <holder>Amit S. Kale</holder> + </copyright> + + <legalnotice> + <para> + This file is licensed under the terms of the GNU General Public License + version 2. This program is licensed "as is" without any warranty of any + kind, whether express or implied. + </para> + + </legalnotice> + </bookinfo> + +<toc></toc> + <chapter id="Introduction"> + <title>Introduction</title> + <para> + kgdb is a source level debugger for linux kernel. It is used along + with gdb to debug a linux kernel. The expectation is that gdb can + be used to "break in" to the kernel to inspect memory, variables + and look through call stack information similar to what an + application developer would use gdb for. It is possible to place + breakpoints in kernel code and perform some limited execution + stepping. + </para> + <para> + Two machines are required for using kgdb. One of these machines is a + development machine and the other is a test machine. The kernel + to be debugged runs on the test machine. The development machine + runs an instance of gdb against the vmlinux file which contains + the symbols (not boot image such as bzImage, zImage, uImage...). + In gdb the developer specifies the connection parameters and + connects to kgdb. The type of connection a developer makes with + gdb depends on the availability of kgdb I/O modules compiled as + builtin's or kernel modules in the test machine's kernel. + </para> + </chapter> + <chapter id="CompilingAKernel"> + <title>Compiling a kernel</title> + <para> + To enable <symbol>CONFIG_KGDB</symbol> you should first turn on + "Prompt for development and/or incomplete code/drivers" + (CONFIG_EXPERIMENTAL) in "General setup", then under the + "Kernel debugging" select "KGDB: kernel debugging with remote gdb". + </para> + <para> + It is advised, but not required that you turn on the + CONFIG_FRAME_POINTER kernel option. This option inserts code to + into the compiled executable which saves the frame information in + registers or on the stack at different points which will allow a + debugger such as gdb to more accurately construct stack back traces + while debugging the kernel. + </para> + <para> + If the architecture that you are using supports the kernel option + CONFIG_DEBUG_RODATA, you should consider turning it off. This + option will prevent the use of software breakpoints because it + marks certain regions of the kernel's memory space as read-only. + If kgdb supports it for the architecture you are using, you can + use hardware breakpoints if you desire to run with the + CONFIG_DEBUG_RODATA option turned on, else you need to turn off + this option. + </para> + <para> + Next you should choose one of more I/O drivers to interconnect debugging + host and debugged target. Early boot debugging requires a KGDB + I/O driver that supports early debugging and the driver must be + built into the kernel directly. Kgdb I/O driver configuration + takes place via kernel or module parameters, see following + chapter. + </para> + <para> + The kgdb test compile options are described in the kgdb test suite chapter. + </para> + + </chapter> + <chapter id="EnableKGDB"> + <title>Enable kgdb for debugging</title> + <para> + In order to use kgdb you must activate it by passing configuration + information to one of the kgdb I/O drivers. If you do not pass any + configuration information kgdb will not do anything at all. Kgdb + will only actively hook up to the kernel trap hooks if a kgdb I/O + driver is loaded and configured. If you unconfigure a kgdb I/O + driver, kgdb will unregister all the kernel hook points. + </para> + <para> + All drivers can be reconfigured at run time, if + <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol> + are enabled, by echo'ing a new config string to + <constant>/sys/module/<driver>/parameter/<option></constant>. + The driver can be unconfigured by passing an empty string. You cannot + change the configuration while the debugger is attached. Make sure + to detach the debugger with the <constant>detach</constant> command + prior to trying unconfigure a kgdb I/O driver. + </para> + <sect1 id="kgdbwait"> + <title>Kernel parameter: kgdbwait</title> + <para> + The Kernel command line option <constant>kgdbwait</constant> makes + kgdb wait for a debugger connection during booting of a kernel. You + can only use this option you compiled a kgdb I/O driver into the + kernel and you specified the I/O driver configuration as a kernel + command line option. The kgdbwait parameter should always follow the + configuration parameter for the kgdb I/O driver in the kernel + command line else the I/O driver will not be configured prior to + asking the kernel to use it to wait. + </para> + <para> + The kernel will stop and wait as early as the I/O driver and + architecture will allow when you use this option. If you build the + kgdb I/O driver as a kernel module kgdbwait will not do anything. + </para> + </sect1> + <sect1 id="kgdboc"> + <title>Kernel parameter: kgdboc</title> + <para> + The kgdboc driver was originally an abbreviation meant to stand for + "kgdb over console". Kgdboc is designed to work with a single + serial port. It was meant to cover the circumstance + where you wanted to use a serial console as your primary console as + well as using it to perform kernel debugging. Of course you can + also use kgdboc without assigning a console to the same port. + </para> + <sect2 id="UsingKgdboc"> + <title>Using kgdboc</title> + <para> + You can configure kgdboc via sysfs or a module or kernel boot line + parameter depending on if you build with CONFIG_KGDBOC as a module + or built-in. + <orderedlist> + <listitem><para>From the module load or build-in</para> + <para><constant>kgdboc=<tty-device>,[baud]</constant></para> + <para> + The example here would be if your console port was typically ttyS0, you would use something like <constant>kgdboc=ttyS0,115200</constant> or on the ARM Versatile AB you would likely use <constant>kgdboc=ttyAMA0,115200</constant> + </para> + </listitem> + <listitem><para>From sysfs</para> + <para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para> + </listitem> + </orderedlist> + </para> + <para> + NOTE: Kgdboc does not support interrupting the target via the + gdb remote protocol. You must manually send a sysrq-g unless you + have a proxy that splits console output to a terminal problem and + has a separate port for the debugger to connect to that sends the + sysrq-g for you. + </para> + <para>When using kgdboc with no debugger proxy, you can end up + connecting the debugger for one of two entry points. If an + exception occurs after you have loaded kgdboc a message should print + on the console stating it is waiting for the debugger. In case you + disconnect your terminal program and then connect the debugger in + its place. If you want to interrupt the target system and forcibly + enter a debug session you have to issue a Sysrq sequence and then + type the letter <constant>g</constant>. Then you disconnect the + terminal session and connect gdb. Your options if you don't like + this are to hack gdb to send the sysrq-g for you as well as on the + initial connect, or to use a debugger proxy that allows an + unmodified gdb to do the debugging. + </para> + </sect2> + </sect1> + <sect1 id="kgdbcon"> + <title>Kernel parameter: kgdbcon</title> + <para> + Kgdb supports using the gdb serial protocol to send console messages + to the debugger when the debugger is connected and running. There + are two ways to activate this feature. + <orderedlist> + <listitem><para>Activate with the kernel command line option:</para> + <para><constant>kgdbcon</constant></para> + </listitem> + <listitem><para>Use sysfs before configuring an io driver</para> + <para> + <constant>echo 1 > /sys/module/kgdb/parameters/kgdb_use_con</constant> + </para> + <para> + NOTE: If you do this after you configure the kgdb I/O driver, the + setting will not take effect until the next point the I/O is + reconfigured. + </para> + </listitem> + </orderedlist> + </para> + <para> + IMPORTANT NOTE: Using this option with kgdb over the console + (kgdboc) is not supported. + </para> + </sect1> + </chapter> + <chapter id="ConnectingGDB"> + <title>Connecting gdb</title> + <para> + If you are using kgdboc, you need to have used kgdbwait as a boot + argument, issued a sysrq-g, or the system you are going to debug + has already taken an exception and is waiting for the debugger to + attach before you can connect gdb. + </para> + <para> + If you are not using different kgdb I/O driver other than kgdboc, + you should be able to connect and the target will automatically + respond. + </para> + <para> + Example (using a serial port): + </para> + <programlisting> + % gdb ./vmlinux + (gdb) set remotebaud 115200 + (gdb) target remote /dev/ttyS0 + </programlisting> + <para> + Example (kgdb to a terminal server on tcp port 2012): + </para> + <programlisting> + % gdb ./vmlinux + (gdb) target remote 192.168.2.2:2012 + </programlisting> + <para> + Once connected, you can debug a kernel the way you would debug an + application program. + </para> + <para> + If you are having problems connecting or something is going + seriously wrong while debugging, it will most often be the case + that you want to enable gdb to be verbose about its target + communications. You do this prior to issuing the <constant>target + remote</constant> command by typing in: <constant>set remote debug 1</constant> + </para> + </chapter> + <chapter id="KGDBTestSuite"> + <title>kgdb Test Suite</title> + <para> + When kgdb is enabled in the kernel config you can also elect to + enable the config parameter KGDB_TESTS. Turning this on will + enable a special kgdb I/O module which is designed to test the + kgdb internal functions. + </para> + <para> + The kgdb tests are mainly intended for developers to test the kgdb + internals as well as a tool for developing a new kgdb architecture + specific implementation. These tests are not really for end users + of the Linux kernel. The primary source of documentation would be + to look in the drivers/misc/kgdbts.c file. + </para> + <para> + The kgdb test suite can also be configured at compile time to run + the core set of tests by setting the kernel config parameter + KGDB_TESTS_ON_BOOT. This particular option is aimed at automated + regression testing and does not require modifying the kernel boot + config arguments. If this is turned on, the kgdb test suite can + be disabled by specifying "kgdbts=" as a kernel boot argument. + </para> + </chapter> + <chapter id="CommonBackEndReq"> + <title>KGDB Internals</title> + <sect1 id="kgdbArchitecture"> + <title>Architecture Specifics</title> + <para> + Kgdb is organized into three basic components: + <orderedlist> + <listitem><para>kgdb core</para> + <para> + The kgdb core is found in kernel/kgdb.c. It contains: + <itemizedlist> + <listitem><para>All the logic to implement the gdb serial protocol</para></listitem> + <listitem><para>A generic OS exception handler which includes sync'ing the processors into a stopped state on an multi cpu system.</para></listitem> + <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem> + <listitem><para>The API to make calls to the arch specific kgdb implementation</para></listitem> + <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem> + <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem> + </itemizedlist> + </para> + </listitem> + <listitem><para>kgdb arch specific implementation</para> + <para> + This implementation is generally found in arch/*/kernel/kgdb.c. + As an example, arch/x86/kernel/kgdb.c contains the specifics to + implement HW breakpoint as well as the initialization to + dynamically register and unregister for the trap handlers on + this architecture. The arch specific portion implements: + <itemizedlist> + <listitem><para>contains an arch specific trap catcher which + invokes kgdb_handle_exception() to start kgdb about doing its + work</para></listitem> + <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem> + <listitem><para>Registration and unregistration of architecture specific trap hooks</para></listitem> + <listitem><para>Any special exception handling and cleanup</para></listitem> + <listitem><para>NMI exception handling and cleanup</para></listitem> + <listitem><para>(optional)HW breakpoints</para></listitem> + </itemizedlist> + </para> + </listitem> + <listitem><para>kgdb I/O driver</para> + <para> + Each kgdb I/O driver has to provide an implemenation for the following: + <itemizedlist> + <listitem><para>configuration via builtin or module</para></listitem> + <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem> + <listitem><para>read and write character interface</para></listitem> + <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem> + <listitem><para>(optional) Early debug methodology</para></listitem> + </itemizedlist> + Any given kgdb I/O driver has to operate very closely with the + hardware and must do it in such a way that does not enable + interrupts or change other parts of the system context without + completely restoring them. The kgdb core will repeatedly "poll" + a kgdb I/O driver for characters when it needs input. The I/O + driver is expected to return immediately if there is no data + available. Doing so allows for the future possibility to touch + watch dog hardware in such a way as to have a target system not + reset when these are enabled. + </para> + </listitem> + </orderedlist> + </para> + <para> + If you are intent on adding kgdb architecture specific support + for a new architecture, the architecture should define + <constant>HAVE_ARCH_KGDB</constant> in the architecture specific + Kconfig file. This will enable kgdb for the architecture, and + at that point you must create an architecture specific kgdb + implementation. + </para> + <para> + There are a few flags which must be set on every architecture in + their <asm/kgdb.h> file. These are: + <itemizedlist> + <listitem> + <para> + NUMREGBYTES: The size in bytes of all of the registers, so + that we can ensure they will all fit into a packet. + </para> + <para> + BUFMAX: The size in bytes of the buffer GDB will read into. + This must be larger than NUMREGBYTES. + </para> + <para> + CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call + flush_cache_range or flush_icache_range. On some architectures, + these functions may not be safe to call on SMP since we keep other + CPUs in a holding pattern. + </para> + </listitem> + </itemizedlist> + </para> + <para> + There are also the following functions for the common backend, + found in kernel/kgdb.c, that must be supplied by the + architecture-specific backend unless marked as (optional), in + which case a default function maybe used if the architecture + does not need to provide a specific implementation. + </para> +!Iinclude/linux/kgdb.h + </sect1> + <sect1 id="kgdbocDesign"> + <title>kgdboc internals</title> + <para> + The kgdboc driver is actually a very thin driver that relies on the + underlying low level to the hardware driver having "polling hooks" + which the to which the tty driver is attached. In the initial + implementation of kgdboc it the serial_core was changed to expose a + low level uart hook for doing polled mode reading and writing of a + single character while in an atomic context. When kgdb makes an I/O + request to the debugger, kgdboc invokes a call back in the serial + core which in turn uses the call back in the uart driver. It is + certainly possible to extend kgdboc to work with non-uart based + consoles in the future. + </para> + <para> + When using kgdboc with a uart, the uart driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting> +#ifdef CONFIG_CONSOLE_POLL + .poll_get_char = serial8250_get_poll_char, + .poll_put_char = serial8250_put_poll_char, +#endif + </programlisting> + Any implementation specifics around creating a polling driver use the + <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above. + Keep in mind that polling hooks have to be implemented in such a way + that they can be called from an atomic context and have to restore + the state of the uart chip on return such that the system can return + to normal when the debugger detaches. You need to be very careful + with any kind of lock you consider, because failing here is most + going to mean pressing the reset button. + </para> + </sect1> + </chapter> + <chapter id="credits"> + <title>Credits</title> + <para> + The following people have contributed to this document: + <orderedlist> + <listitem><para>Amit Kale<email>amitkale@linsyssoft.com</email></para></listitem> + <listitem><para>Tom Rini<email>trini@kernel.crashing.org</email></para></listitem> + </orderedlist> + In March 2008 this document was completely rewritten by: + <itemizedlist> + <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> + </itemizedlist> + </para> + </chapter> +</book> + diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl new file mode 100644 index 0000000..ba99757 --- /dev/null +++ b/Documentation/DocBook/libata.tmpl @@ -0,0 +1,1632 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="libataDevGuide"> + <bookinfo> + <title>libATA Developer's Guide</title> + + <authorgroup> + <author> + <firstname>Jeff</firstname> + <surname>Garzik</surname> + </author> + </authorgroup> + + <copyright> + <year>2003-2006</year> + <holder>Jeff Garzik</holder> + </copyright> + + <legalnotice> + <para> + The contents of this file are subject to the Open + Software License version 1.1 that can be found at + <ulink url="http://www.opensource.org/licenses/osl-1.1.txt">http://www.opensource.org/licenses/osl-1.1.txt</ulink> and is included herein + by reference. + </para> + + <para> + Alternatively, the contents of this file may be used under the terms + of the GNU General Public License version 2 (the "GPL") as distributed + in the kernel source COPYING file, in which case the provisions of + the GPL are applicable instead of the above. If you wish to allow + the use of your version of this file only under the terms of the + GPL and not to allow others to use your version of this file under + the OSL, indicate your decision by deleting the provisions above and + replace them with the notice and other provisions required by the GPL. + If you do not delete the provisions above, a recipient may use your + version of this file under either the OSL or the GPL. + </para> + + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="libataIntroduction"> + <title>Introduction</title> + <para> + libATA is a library used inside the Linux kernel to support ATA host + controllers and devices. libATA provides an ATA driver API, class + transports for ATA and ATAPI devices, and SCSI<->ATA translation + for ATA devices according to the T10 SAT specification. + </para> + <para> + This Guide documents the libATA driver API, library functions, library + internals, and a couple sample ATA low-level drivers. + </para> + </chapter> + + <chapter id="libataDriverApi"> + <title>libata Driver API</title> + <para> + struct ata_port_operations is defined for every low-level libata + hardware driver, and it controls how the low-level driver + interfaces with the ATA and SCSI layers. + </para> + <para> + FIS-based drivers will hook into the system with ->qc_prep() and + ->qc_issue() high-level hooks. Hardware which behaves in a manner + similar to PCI IDE hardware may utilize several generic helpers, + defining at a bare minimum the bus I/O addresses of the ATA shadow + register blocks. + </para> + <sect1> + <title>struct ata_port_operations</title> + + <sect2><title>Disable ATA port</title> + <programlisting> +void (*port_disable) (struct ata_port *); + </programlisting> + + <para> + Called from ata_bus_probe() and ata_bus_reset() error paths, + as well as when unregistering from the SCSI module (rmmod, hot + unplug). + This function should do whatever needs to be done to take the + port out of use. In most cases, ata_port_disable() can be used + as this hook. + </para> + <para> + Called from ata_bus_probe() on a failed probe. + Called from ata_bus_reset() on a failed bus reset. + Called from ata_scsi_release(). + </para> + + </sect2> + + <sect2><title>Post-IDENTIFY device configuration</title> + <programlisting> +void (*dev_config) (struct ata_port *, struct ata_device *); + </programlisting> + + <para> + Called after IDENTIFY [PACKET] DEVICE is issued to each device + found. Typically used to apply device-specific fixups prior to + issue of SET FEATURES - XFER MODE, and prior to operation. + </para> + <para> + Called by ata_device_add() after ata_dev_identify() determines + a device is present. + </para> + <para> + This entry may be specified as NULL in ata_port_operations. + </para> + + </sect2> + + <sect2><title>Set PIO/DMA mode</title> + <programlisting> +void (*set_piomode) (struct ata_port *, struct ata_device *); +void (*set_dmamode) (struct ata_port *, struct ata_device *); +void (*post_set_mode) (struct ata_port *); +unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int); + </programlisting> + + <para> + Hooks called prior to the issue of SET FEATURES - XFER MODE + command. The optional ->mode_filter() hook is called when libata + has built a mask of the possible modes. This is passed to the + ->mode_filter() function which should return a mask of valid modes + after filtering those unsuitable due to hardware limits. It is not + valid to use this interface to add modes. + </para> + <para> + dev->pio_mode and dev->dma_mode are guaranteed to be valid when + ->set_piomode() and when ->set_dmamode() is called. The timings for + any other drive sharing the cable will also be valid at this point. + That is the library records the decisions for the modes of each + drive on a channel before it attempts to set any of them. + </para> + <para> + ->post_set_mode() is + called unconditionally, after the SET FEATURES - XFER MODE + command completes successfully. + </para> + + <para> + ->set_piomode() is always called (if present), but + ->set_dma_mode() is only called if DMA is possible. + </para> + + </sect2> + + <sect2><title>Taskfile read/write</title> + <programlisting> +void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf); +void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); + </programlisting> + + <para> + ->tf_load() is called to load the given taskfile into hardware + registers / DMA buffers. ->tf_read() is called to read the + hardware registers / DMA buffers, to obtain the current set of + taskfile register values. + Most drivers for taskfile-based hardware (PIO or MMIO) use + ata_tf_load() and ata_tf_read() for these hooks. + </para> + + </sect2> + + <sect2><title>PIO data read/write</title> + <programlisting> +void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); + </programlisting> + + <para> +All bmdma-style drivers must implement this hook. This is the low-level +operation that actually copies the data bytes during a PIO data +transfer. +Typically the driver +will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or +ata_mmio_data_xfer(). + </para> + + </sect2> + + <sect2><title>ATA command execute</title> + <programlisting> +void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf); + </programlisting> + + <para> + causes an ATA command, previously loaded with + ->tf_load(), to be initiated in hardware. + Most drivers for taskfile-based hardware use ata_exec_command() + for this hook. + </para> + + </sect2> + + <sect2><title>Per-cmd ATAPI DMA capabilities filter</title> + <programlisting> +int (*check_atapi_dma) (struct ata_queued_cmd *qc); + </programlisting> + + <para> +Allow low-level driver to filter ATA PACKET commands, returning a status +indicating whether or not it is OK to use DMA for the supplied PACKET +command. + </para> + <para> + This hook may be specified as NULL, in which case libata will + assume that atapi dma can be supported. + </para> + + </sect2> + + <sect2><title>Read specific ATA shadow registers</title> + <programlisting> +u8 (*check_status)(struct ata_port *ap); +u8 (*check_altstatus)(struct ata_port *ap); + </programlisting> + + <para> + Reads the Status/AltStatus ATA shadow register from + hardware. On some hardware, reading the Status register has + the side effect of clearing the interrupt condition. + Most drivers for taskfile-based hardware use + ata_check_status() for this hook. + </para> + <para> + Note that because this is called from ata_device_add(), at + least a dummy function that clears device interrupts must be + provided for all drivers, even if the controller doesn't + actually have a taskfile status register. + </para> + + </sect2> + + <sect2><title>Select ATA device on bus</title> + <programlisting> +void (*dev_select)(struct ata_port *ap, unsigned int device); + </programlisting> + + <para> + Issues the low-level hardware command(s) that causes one of N + hardware devices to be considered 'selected' (active and + available for use) on the ATA bus. This generally has no + meaning on FIS-based devices. + </para> + <para> + Most drivers for taskfile-based hardware use + ata_std_dev_select() for this hook. Controllers which do not + support second drives on a port (such as SATA contollers) will + use ata_noop_dev_select(). + </para> + + </sect2> + + <sect2><title>Private tuning method</title> + <programlisting> +void (*set_mode) (struct ata_port *ap); + </programlisting> + + <para> + By default libata performs drive and controller tuning in + accordance with the ATA timing rules and also applies blacklists + and cable limits. Some controllers need special handling and have + custom tuning rules, typically raid controllers that use ATA + commands but do not actually do drive timing. + </para> + + <warning> + <para> + This hook should not be used to replace the standard controller + tuning logic when a controller has quirks. Replacing the default + tuning logic in that case would bypass handling for drive and + bridge quirks that may be important to data reliability. If a + controller needs to filter the mode selection it should use the + mode_filter hook instead. + </para> + </warning> + + </sect2> + + <sect2><title>Control PCI IDE BMDMA engine</title> + <programlisting> +void (*bmdma_setup) (struct ata_queued_cmd *qc); +void (*bmdma_start) (struct ata_queued_cmd *qc); +void (*bmdma_stop) (struct ata_port *ap); +u8 (*bmdma_status) (struct ata_port *ap); + </programlisting> + + <para> +When setting up an IDE BMDMA transaction, these hooks arm +(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop) +the hardware's DMA engine. ->bmdma_status is used to read the standard +PCI IDE DMA Status register. + </para> + + <para> +These hooks are typically either no-ops, or simply not implemented, in +FIS-based drivers. + </para> + <para> +Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup() +hook. ata_bmdma_setup() will write the pointer to the PRD table to +the IDE PRD Table Address register, enable DMA in the DMA Command +register, and call exec_command() to begin the transfer. + </para> + <para> +Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start() +hook. ata_bmdma_start() will write the ATA_DMA_START flag to the DMA +Command register. + </para> + <para> +Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop() +hook. ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA +command register. + </para> + <para> +Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook. + </para> + + </sect2> + + <sect2><title>High-level taskfile hooks</title> + <programlisting> +void (*qc_prep) (struct ata_queued_cmd *qc); +int (*qc_issue) (struct ata_queued_cmd *qc); + </programlisting> + + <para> + Higher-level hooks, these two hooks can potentially supercede + several of the above taskfile/DMA engine hooks. ->qc_prep is + called after the buffers have been DMA-mapped, and is typically + used to populate the hardware's DMA scatter-gather table. + Most drivers use the standard ata_qc_prep() helper function, but + more advanced drivers roll their own. + </para> + <para> + ->qc_issue is used to make a command active, once the hardware + and S/G tables have been prepared. IDE BMDMA drivers use the + helper function ata_qc_issue_prot() for taskfile protocol-based + dispatch. More advanced drivers implement their own ->qc_issue. + </para> + <para> + ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and + ->bmdma_start() as necessary to initiate a transfer. + </para> + + </sect2> + + <sect2><title>Exception and probe handling (EH)</title> + <programlisting> +void (*eng_timeout) (struct ata_port *ap); +void (*phy_reset) (struct ata_port *ap); + </programlisting> + + <para> +Deprecated. Use ->error_handler() instead. + </para> + + <programlisting> +void (*freeze) (struct ata_port *ap); +void (*thaw) (struct ata_port *ap); + </programlisting> + + <para> +ata_port_freeze() is called when HSM violations or some other +condition disrupts normal operation of the port. A frozen port +is not allowed to perform any operation until the port is +thawed, which usually follows a successful reset. + </para> + + <para> +The optional ->freeze() callback can be used for freezing the port +hardware-wise (e.g. mask interrupt and stop DMA engine). If a +port cannot be frozen hardware-wise, the interrupt handler +must ack and clear interrupts unconditionally while the port +is frozen. + </para> + <para> +The optional ->thaw() callback is called to perform the opposite of ->freeze(): +prepare the port for normal operation once again. Unmask interrupts, +start DMA engine, etc. + </para> + + <programlisting> +void (*error_handler) (struct ata_port *ap); + </programlisting> + + <para> +->error_handler() is a driver's hook into probe, hotplug, and recovery +and other exceptional conditions. The primary responsibility of an +implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set +of EH hooks as arguments: + </para> + + <para> +'prereset' hook (may be NULL) is called during an EH reset, before any other actions +are taken. + </para> + + <para> +'postreset' hook (may be NULL) is called after the EH reset is performed. Based on +existing conditions, severity of the problem, and hardware capabilities, + </para> + + <para> +Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be +called to perform the low-level EH reset. + </para> + + <programlisting> +void (*post_internal_cmd) (struct ata_queued_cmd *qc); + </programlisting> + + <para> +Perform any hardware-specific actions necessary to finish processing +after executing a probe-time or EH-time command via ata_exec_internal(). + </para> + + </sect2> + + <sect2><title>Hardware interrupt handling</title> + <programlisting> +irqreturn_t (*irq_handler)(int, void *, struct pt_regs *); +void (*irq_clear) (struct ata_port *); + </programlisting> + + <para> + ->irq_handler is the interrupt handling routine registered with + the system, by libata. ->irq_clear is called during probe just + before the interrupt handler is registered, to be sure hardware + is quiet. + </para> + <para> + The second argument, dev_instance, should be cast to a pointer + to struct ata_host_set. + </para> + <para> + Most legacy IDE drivers use ata_interrupt() for the + irq_handler hook, which scans all ports in the host_set, + determines which queued command was active (if any), and calls + ata_host_intr(ap,qc). + </para> + <para> + Most legacy IDE drivers use ata_bmdma_irq_clear() for the + irq_clear() hook, which simply clears the interrupt and error + flags in the DMA status register. + </para> + + </sect2> + + <sect2><title>SATA phy read/write</title> + <programlisting> +int (*scr_read) (struct ata_port *ap, unsigned int sc_reg, + u32 *val); +int (*scr_write) (struct ata_port *ap, unsigned int sc_reg, + u32 val); + </programlisting> + + <para> + Read and write standard SATA phy registers. Currently only used + if ->phy_reset hook called the sata_phy_reset() helper function. + sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE. + </para> + + </sect2> + + <sect2><title>Init and shutdown</title> + <programlisting> +int (*port_start) (struct ata_port *ap); +void (*port_stop) (struct ata_port *ap); +void (*host_stop) (struct ata_host_set *host_set); + </programlisting> + + <para> + ->port_start() is called just after the data structures for each + port are initialized. Typically this is used to alloc per-port + DMA buffers / tables / rings, enable DMA engines, and similar + tasks. Some drivers also use this entry point as a chance to + allocate driver-private memory for ap->private_data. + </para> + <para> + Many drivers use ata_port_start() as this hook or call + it from their own port_start() hooks. ata_port_start() + allocates space for a legacy IDE PRD table and returns. + </para> + <para> + ->port_stop() is called after ->host_stop(). It's sole function + is to release DMA/memory resources, now that they are no longer + actively being used. Many drivers also free driver-private + data from port at this time. + </para> + <para> + Many drivers use ata_port_stop() as this hook, which frees the + PRD table. + </para> + <para> + ->host_stop() is called after all ->port_stop() calls +have completed. The hook must finalize hardware shutdown, release DMA +and other resources, etc. + This hook may be specified as NULL, in which case it is not called. + </para> + + </sect2> + + </sect1> + </chapter> + + <chapter id="libataEH"> + <title>Error handling</title> + + <para> + This chapter describes how errors are handled under libata. + Readers are advised to read SCSI EH + (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first. + </para> + + <sect1><title>Origins of commands</title> + <para> + In libata, a command is represented with struct ata_queued_cmd + or qc. qc's are preallocated during port initialization and + repetitively used for command executions. Currently only one + qc is allocated per port but yet-to-be-merged NCQ branch + allocates one for each tag and maps each qc to NCQ tag 1-to-1. + </para> + <para> + libata commands can originate from two sources - libata itself + and SCSI midlayer. libata internal commands are used for + initialization and error handling. All normal blk requests + and commands for SCSI emulation are passed as SCSI commands + through queuecommand callback of SCSI host template. + </para> + </sect1> + + <sect1><title>How commands are issued</title> + + <variablelist> + + <varlistentry><term>Internal commands</term> + <listitem> + <para> + First, qc is allocated and initialized using + ata_qc_new_init(). Although ata_qc_new_init() doesn't + implement any wait or retry mechanism when qc is not + available, internal commands are currently issued only during + initialization and error recovery, so no other command is + active and allocation is guaranteed to succeed. + </para> + <para> + Once allocated qc's taskfile is initialized for the command to + be executed. qc currently has two mechanisms to notify + completion. One is via qc->complete_fn() callback and the + other is completion qc->waiting. qc->complete_fn() callback + is the asynchronous path used by normal SCSI translated + commands and qc->waiting is the synchronous (issuer sleeps in + process context) path used by internal commands. + </para> + <para> + Once initialization is complete, host_set lock is acquired + and the qc is issued. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>SCSI commands</term> + <listitem> + <para> + All libata drivers use ata_scsi_queuecmd() as + hostt->queuecommand callback. scmds can either be simulated + or translated. No qc is involved in processing a simulated + scmd. The result is computed right away and the scmd is + completed. + </para> + <para> + For a translated scmd, ata_qc_new_init() is invoked to + allocate a qc and the scmd is translated into the qc. SCSI + midlayer's completion notification function pointer is stored + into qc->scsidone. + </para> + <para> + qc->complete_fn() callback is used for completion + notification. ATA commands use ata_scsi_qc_complete() while + ATAPI commands use atapi_qc_complete(). Both functions end up + calling qc->scsidone to notify upper layer when the qc is + finished. After translation is completed, the qc is issued + with ata_qc_issue(). + </para> + <para> + Note that SCSI midlayer invokes hostt->queuecommand while + holding host_set lock, so all above occur while holding + host_set lock. + </para> + </listitem> + </varlistentry> + + </variablelist> + </sect1> + + <sect1><title>How commands are processed</title> + <para> + Depending on which protocol and which controller are used, + commands are processed differently. For the purpose of + discussion, a controller which uses taskfile interface and all + standard callbacks is assumed. + </para> + <para> + Currently 6 ATA command protocols are used. They can be + sorted into the following four categories according to how + they are processed. + </para> + + <variablelist> + <varlistentry><term>ATA NO DATA or DMA</term> + <listitem> + <para> + ATA_PROT_NODATA and ATA_PROT_DMA fall into this category. + These types of commands don't require any software + intervention once issued. Device will raise interrupt on + completion. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>ATA PIO</term> + <listitem> + <para> + ATA_PROT_PIO is in this category. libata currently + implements PIO with polling. ATA_NIEN bit is set to turn + off interrupt and pio_task on ata_wq performs polling and + IO. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>ATAPI NODATA or DMA</term> + <listitem> + <para> + ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this + category. packet_task is used to poll BSY bit after + issuing PACKET command. Once BSY is turned off by the + device, packet_task transfers CDB and hands off processing + to interrupt handler. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>ATAPI PIO</term> + <listitem> + <para> + ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set + and, as in ATAPI NODATA or DMA, packet_task submits cdb. + However, after submitting cdb, further processing (data + transfer) is handed off to pio_task. + </para> + </listitem> + </varlistentry> + </variablelist> + </sect1> + + <sect1><title>How commands are completed</title> + <para> + Once issued, all qc's are either completed with + ata_qc_complete() or time out. For commands which are handled + by interrupts, ata_host_intr() invokes ata_qc_complete(), and, + for PIO tasks, pio_task invokes ata_qc_complete(). In error + cases, packet_task may also complete commands. + </para> + <para> + ata_qc_complete() does the following. + </para> + + <orderedlist> + + <listitem> + <para> + DMA memory is unmapped. + </para> + </listitem> + + <listitem> + <para> + ATA_QCFLAG_ACTIVE is clared from qc->flags. + </para> + </listitem> + + <listitem> + <para> + qc->complete_fn() callback is invoked. If the return value of + the callback is not zero. Completion is short circuited and + ata_qc_complete() returns. + </para> + </listitem> + + <listitem> + <para> + __ata_qc_complete() is called, which does + <orderedlist> + + <listitem> + <para> + qc->flags is cleared to zero. + </para> + </listitem> + + <listitem> + <para> + ap->active_tag and qc->tag are poisoned. + </para> + </listitem> + + <listitem> + <para> + qc->waiting is claread & completed (in that order). + </para> + </listitem> + + <listitem> + <para> + qc is deallocated by clearing appropriate bit in ap->qactive. + </para> + </listitem> + + </orderedlist> + </para> + </listitem> + + </orderedlist> + + <para> + So, it basically notifies upper layer and deallocates qc. One + exception is short-circuit path in #3 which is used by + atapi_qc_complete(). + </para> + <para> + For all non-ATAPI commands, whether it fails or not, almost + the same code path is taken and very little error handling + takes place. A qc is completed with success status if it + succeeded, with failed status otherwise. + </para> + <para> + However, failed ATAPI commands require more handling as + REQUEST SENSE is needed to acquire sense data. If an ATAPI + command fails, ata_qc_complete() is invoked with error status, + which in turn invokes atapi_qc_complete() via + qc->complete_fn() callback. + </para> + <para> + This makes atapi_qc_complete() set scmd->result to + SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As + the sense data is empty but scmd->result is CHECK CONDITION, + SCSI midlayer will invoke EH for the scmd, and returning 1 + makes ata_qc_complete() to return without deallocating the qc. + This leads us to ata_scsi_error() with partially completed qc. + </para> + + </sect1> + + <sect1><title>ata_scsi_error()</title> + <para> + ata_scsi_error() is the current transportt->eh_strategy_handler() + for libata. As discussed above, this will be entered in two + cases - timeout and ATAPI error completion. This function + calls low level libata driver's eng_timeout() callback, the + standard callback for which is ata_eng_timeout(). It checks + if a qc is active and calls ata_qc_timeout() on the qc if so. + Actual error handling occurs in ata_qc_timeout(). + </para> + <para> + If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and + completes the qc. Note that as we're currently in EH, we + cannot call scsi_done. As described in SCSI EH doc, a + recovered scmd should be either retried with + scsi_queue_insert() or finished with scsi_finish_command(). + Here, we override qc->scsidone with scsi_finish_command() and + calls ata_qc_complete(). + </para> + <para> + If EH is invoked due to a failed ATAPI qc, the qc here is + completed but not deallocated. The purpose of this + half-completion is to use the qc as place holder to make EH + code reach this place. This is a bit hackish, but it works. + </para> + <para> + Once control reaches here, the qc is deallocated by invoking + __ata_qc_complete() explicitly. Then, internal qc for REQUEST + SENSE is issued. Once sense data is acquired, scmd is + finished by directly invoking scsi_finish_command() on the + scmd. Note that as we already have completed and deallocated + the qc which was associated with the scmd, we don't need + to/cannot call ata_qc_complete() again. + </para> + + </sect1> + + <sect1><title>Problems with the current EH</title> + + <itemizedlist> + + <listitem> + <para> + Error representation is too crude. Currently any and all + error conditions are represented with ATA STATUS and ERROR + registers. Errors which aren't ATA device errors are treated + as ATA device errors by setting ATA_ERR bit. Better error + descriptor which can properly represent ATA and other + errors/exceptions is needed. + </para> + </listitem> + + <listitem> + <para> + When handling timeouts, no action is taken to make device + forget about the timed out command and ready for new commands. + </para> + </listitem> + + <listitem> + <para> + EH handling via ata_scsi_error() is not properly protected + from usual command processing. On EH entrance, the device is + not in quiescent state. Timed out commands may succeed or + fail any time. pio_task and atapi_task may still be running. + </para> + </listitem> + + <listitem> + <para> + Too weak error recovery. Devices / controllers causing HSM + mismatch errors and other errors quite often require reset to + return to known state. Also, advanced error handling is + necessary to support features like NCQ and hotplug. + </para> + </listitem> + + <listitem> + <para> + ATA errors are directly handled in the interrupt handler and + PIO errors in pio_task. This is problematic for advanced + error handling for the following reasons. + </para> + <para> + First, advanced error handling often requires context and + internal qc execution. + </para> + <para> + Second, even a simple failure (say, CRC error) needs + information gathering and could trigger complex error handling + (say, resetting & reconfiguring). Having multiple code + paths to gather information, enter EH and trigger actions + makes life painful. + </para> + <para> + Third, scattered EH code makes implementing low level drivers + difficult. Low level drivers override libata callbacks. If + EH is scattered over several places, each affected callbacks + should perform its part of error handling. This can be error + prone and painful. + </para> + </listitem> + + </itemizedlist> + </sect1> + </chapter> + + <chapter id="libataExt"> + <title>libata Library</title> +!Edrivers/ata/libata-core.c + </chapter> + + <chapter id="libataInt"> + <title>libata Core Internals</title> +!Idrivers/ata/libata-core.c + </chapter> + + <chapter id="libataScsiInt"> + <title>libata SCSI translation/emulation</title> +!Edrivers/ata/libata-scsi.c +!Idrivers/ata/libata-scsi.c + </chapter> + + <chapter id="ataExceptions"> + <title>ATA errors and exceptions</title> + + <para> + This chapter tries to identify what error/exception conditions exist + for ATA/ATAPI devices and describe how they should be handled in + implementation-neutral way. + </para> + + <para> + The term 'error' is used to describe conditions where either an + explicit error condition is reported from device or a command has + timed out. + </para> + + <para> + The term 'exception' is either used to describe exceptional + conditions which are not errors (say, power or hotplug events), or + to describe both errors and non-error exceptional conditions. Where + explicit distinction between error and exception is necessary, the + term 'non-error exception' is used. + </para> + + <sect1 id="excat"> + <title>Exception categories</title> + <para> + Exceptions are described primarily with respect to legacy + taskfile + bus master IDE interface. If a controller provides + other better mechanism for error reporting, mapping those into + categories described below shouldn't be difficult. + </para> + + <para> + In the following sections, two recovery actions - reset and + reconfiguring transport - are mentioned. These are described + further in <xref linkend="exrec"/>. + </para> + + <sect2 id="excatHSMviolation"> + <title>HSM violation</title> + <para> + This error is indicated when STATUS value doesn't match HSM + requirement during issuing or excution any ATA/ATAPI command. + </para> + + <itemizedlist> + <title>Examples</title> + + <listitem> + <para> + ATA_STATUS doesn't contain !BSY && DRDY && !DRQ while trying + to issue a command. + </para> + </listitem> + + <listitem> + <para> + !BSY && !DRQ during PIO data transfer. + </para> + </listitem> + + <listitem> + <para> + DRQ on command completion. + </para> + </listitem> + + <listitem> + <para> + !BSY && ERR after CDB tranfer starts but before the + last byte of CDB is transferred. ATA/ATAPI standard states + that "The device shall not terminate the PACKET command + with an error before the last byte of the command packet has + been written" in the error outputs description of PACKET + command and the state diagram doesn't include such + transitions. + </para> + </listitem> + + </itemizedlist> + + <para> + In these cases, HSM is violated and not much information + regarding the error can be acquired from STATUS or ERROR + register. IOW, this error can be anything - driver bug, + faulty device, controller and/or cable. + </para> + + <para> + As HSM is violated, reset is necessary to restore known state. + Reconfiguring transport for lower speed might be helpful too + as transmission errors sometimes cause this kind of errors. + </para> + </sect2> + + <sect2 id="excatDevErr"> + <title>ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)</title> + + <para> + These are errors detected and reported by ATA/ATAPI devices + indicating device problems. For this type of errors, STATUS + and ERROR register values are valid and describe error + condition. Note that some of ATA bus errors are detected by + ATA/ATAPI devices and reported using the same mechanism as + device errors. Those cases are described later in this + section. + </para> + + <para> + For ATA commands, this type of errors are indicated by !BSY + && ERR during command execution and on completion. + </para> + + <para>For ATAPI commands,</para> + + <itemizedlist> + + <listitem> + <para> + !BSY && ERR && ABRT right after issuing PACKET + indicates that PACKET command is not supported and falls in + this category. + </para> + </listitem> + + <listitem> + <para> + !BSY && ERR(==CHK) && !ABRT after the last + byte of CDB is transferred indicates CHECK CONDITION and + doesn't fall in this category. + </para> + </listitem> + + <listitem> + <para> + !BSY && ERR(==CHK) && ABRT after the last byte + of CDB is transferred *probably* indicates CHECK CONDITION and + doesn't fall in this category. + </para> + </listitem> + + </itemizedlist> + + <para> + Of errors detected as above, the followings are not ATA/ATAPI + device errors but ATA bus errors and should be handled + according to <xref linkend="excatATAbusErr"/>. + </para> + + <variablelist> + + <varlistentry> + <term>CRC error during data transfer</term> + <listitem> + <para> + This is indicated by ICRC bit in the ERROR register and + means that corruption occurred during data transfer. Upto + ATA/ATAPI-7, the standard specifies that this bit is only + applicable to UDMA transfers but ATA/ATAPI-8 draft revision + 1f says that the bit may be applicable to multiword DMA and + PIO. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>ABRT error during data transfer or on completion</term> + <listitem> + <para> + Upto ATA/ATAPI-7, the standard specifies that ABRT could be + set on ICRC errors and on cases where a device is not able + to complete a command. Combined with the fact that MWDMA + and PIO transfer errors aren't allowed to use ICRC bit upto + ATA/ATAPI-7, it seems to imply that ABRT bit alone could + indicate tranfer errors. + </para> + <para> + However, ATA/ATAPI-8 draft revision 1f removes the part + that ICRC errors can turn on ABRT. So, this is kind of + gray area. Some heuristics are needed here. + </para> + </listitem> + </varlistentry> + + </variablelist> + + <para> + ATA/ATAPI device errors can be further categorized as follows. + </para> + + <variablelist> + + <varlistentry> + <term>Media errors</term> + <listitem> + <para> + This is indicated by UNC bit in the ERROR register. ATA + devices reports UNC error only after certain number of + retries cannot recover the data, so there's nothing much + else to do other than notifying upper layer. + </para> + <para> + READ and WRITE commands report CHS or LBA of the first + failed sector but ATA/ATAPI standard specifies that the + amount of transferred data on error completion is + indeterminate, so we cannot assume that sectors preceding + the failed sector have been transferred and thus cannot + complete those sectors successfully as SCSI does. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>Media changed / media change requested error</term> + <listitem> + <para> + <<TODO: fill here>> + </para> + </listitem> + </varlistentry> + + <varlistentry><term>Address error</term> + <listitem> + <para> + This is indicated by IDNF bit in the ERROR register. + Report to upper layer. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>Other errors</term> + <listitem> + <para> + This can be invalid command or parameter indicated by ABRT + ERROR bit or some other error condition. Note that ABRT + bit can indicate a lot of things including ICRC and Address + errors. Heuristics needed. + </para> + </listitem> + </varlistentry> + + </variablelist> + + <para> + Depending on commands, not all STATUS/ERROR bits are + applicable. These non-applicable bits are marked with + "na" in the output descriptions but upto ATA/ATAPI-7 + no definition of "na" can be found. However, + ATA/ATAPI-8 draft revision 1f describes "N/A" as + follows. + </para> + + <blockquote> + <variablelist> + <varlistentry><term>3.2.3.3a N/A</term> + <listitem> + <para> + A keyword the indicates a field has no defined value in + this standard and should not be checked by the host or + device. N/A fields should be cleared to zero. + </para> + </listitem> + </varlistentry> + </variablelist> + </blockquote> + + <para> + So, it seems reasonable to assume that "na" bits are + cleared to zero by devices and thus need no explicit masking. + </para> + + </sect2> + + <sect2 id="excatATAPIcc"> + <title>ATAPI device CHECK CONDITION</title> + + <para> + ATAPI device CHECK CONDITION error is indicated by set CHK bit + (ERR bit) in the STATUS register after the last byte of CDB is + transferred for a PACKET command. For this kind of errors, + sense data should be acquired to gather information regarding + the errors. REQUEST SENSE packet command should be used to + acquire sense data. + </para> + + <para> + Once sense data is acquired, this type of errors can be + handled similary to other SCSI errors. Note that sense data + may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR + && ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such + cases, the error should be considered as an ATA bus error and + handled according to <xref linkend="excatATAbusErr"/>. + </para> + + </sect2> + + <sect2 id="excatNCQerr"> + <title>ATA device error (NCQ)</title> + + <para> + NCQ command error is indicated by cleared BSY and set ERR bit + during NCQ command phase (one or more NCQ commands + outstanding). Although STATUS and ERROR registers will + contain valid values describing the error, READ LOG EXT is + required to clear the error condition, determine which command + has failed and acquire more information. + </para> + + <para> + READ LOG EXT Log Page 10h reports which tag has failed and + taskfile register values describing the error. With this + information the failed command can be handled as a normal ATA + command error as in <xref linkend="excatDevErr"/> and all + other in-flight commands must be retried. Note that this + retry should not be counted - it's likely that commands + retried this way would have completed normally if it were not + for the failed command. + </para> + + <para> + Note that ATA bus errors can be reported as ATA device NCQ + errors. This should be handled as described in <xref + linkend="excatATAbusErr"/>. + </para> + + <para> + If READ LOG EXT Log Page 10h fails or reports NQ, we're + thoroughly screwed. This condition should be treated + according to <xref linkend="excatHSMviolation"/>. + </para> + + </sect2> + + <sect2 id="excatATAbusErr"> + <title>ATA bus error</title> + + <para> + ATA bus error means that data corruption occurred during + transmission over ATA bus (SATA or PATA). This type of errors + can be indicated by + </para> + + <itemizedlist> + + <listitem> + <para> + ICRC or ABRT error as described in <xref linkend="excatDevErr"/>. + </para> + </listitem> + + <listitem> + <para> + Controller-specific error completion with error information + indicating transmission error. + </para> + </listitem> + + <listitem> + <para> + On some controllers, command timeout. In this case, there may + be a mechanism to determine that the timeout is due to + transmission error. + </para> + </listitem> + + <listitem> + <para> + Unknown/random errors, timeouts and all sorts of weirdities. + </para> + </listitem> + + </itemizedlist> + + <para> + As described above, transmission errors can cause wide variety + of symptoms ranging from device ICRC error to random device + lockup, and, for many cases, there is no way to tell if an + error condition is due to transmission error or not; + therefore, it's necessary to employ some kind of heuristic + when dealing with errors and timeouts. For example, + encountering repetitive ABRT errors for known supported + command is likely to indicate ATA bus error. + </para> + + <para> + Once it's determined that ATA bus errors have possibly + occurred, lowering ATA bus transmission speed is one of + actions which may alleviate the problem. See <xref + linkend="exrecReconf"/> for more information. + </para> + + </sect2> + + <sect2 id="excatPCIbusErr"> + <title>PCI bus error</title> + + <para> + Data corruption or other failures during transmission over PCI + (or other system bus). For standard BMDMA, this is indicated + by Error bit in the BMDMA Status register. This type of + errors must be logged as it indicates something is very wrong + with the system. Resetting host controller is recommended. + </para> + + </sect2> + + <sect2 id="excatLateCompletion"> + <title>Late completion</title> + + <para> + This occurs when timeout occurs and the timeout handler finds + out that the timed out command has completed successfully or + with error. This is usually caused by lost interrupts. This + type of errors must be logged. Resetting host controller is + recommended. + </para> + + </sect2> + + <sect2 id="excatUnknown"> + <title>Unknown error (timeout)</title> + + <para> + This is when timeout occurs and the command is still + processing or the host and device are in unknown state. When + this occurs, HSM could be in any valid or invalid state. To + bring the device to known state and make it forget about the + timed out command, resetting is necessary. The timed out + command may be retried. + </para> + + <para> + Timeouts can also be caused by transmission errors. Refer to + <xref linkend="excatATAbusErr"/> for more details. + </para> + + </sect2> + + <sect2 id="excatHoplugPM"> + <title>Hotplug and power management exceptions</title> + + <para> + <<TODO: fill here>> + </para> + + </sect2> + + </sect1> + + <sect1 id="exrec"> + <title>EH recovery actions</title> + + <para> + This section discusses several important recovery actions. + </para> + + <sect2 id="exrecClr"> + <title>Clearing error condition</title> + + <para> + Many controllers require its error registers to be cleared by + error handler. Different controllers may have different + requirements. + </para> + + <para> + For SATA, it's strongly recommended to clear at least SError + register during error handling. + </para> + </sect2> + + <sect2 id="exrecRst"> + <title>Reset</title> + + <para> + During EH, resetting is necessary in the following cases. + </para> + + <itemizedlist> + + <listitem> + <para> + HSM is in unknown or invalid state + </para> + </listitem> + + <listitem> + <para> + HBA is in unknown or invalid state + </para> + </listitem> + + <listitem> + <para> + EH needs to make HBA/device forget about in-flight commands + </para> + </listitem> + + <listitem> + <para> + HBA/device behaves weirdly + </para> + </listitem> + + </itemizedlist> + + <para> + Resetting during EH might be a good idea regardless of error + condition to improve EH robustness. Whether to reset both or + either one of HBA and device depends on situation but the + following scheme is recommended. + </para> + + <itemizedlist> + + <listitem> + <para> + When it's known that HBA is in ready state but ATA/ATAPI + device is in unknown state, reset only device. + </para> + </listitem> + + <listitem> + <para> + If HBA is in unknown state, reset both HBA and device. + </para> + </listitem> + + </itemizedlist> + + <para> + HBA resetting is implementation specific. For a controller + complying to taskfile/BMDMA PCI IDE, stopping active DMA + transaction may be sufficient iff BMDMA state is the only HBA + context. But even mostly taskfile/BMDMA PCI IDE complying + controllers may have implementation specific requirements and + mechanism to reset themselves. This must be addressed by + specific drivers. + </para> + + <para> + OTOH, ATA/ATAPI standard describes in detail ways to reset + ATA/ATAPI devices. + </para> + + <variablelist> + + <varlistentry><term>PATA hardware reset</term> + <listitem> + <para> + This is hardware initiated device reset signalled with + asserted PATA RESET- signal. There is no standard way to + initiate hardware reset from software although some + hardware provides registers that allow driver to directly + tweak the RESET- signal. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>Software reset</term> + <listitem> + <para> + This is achieved by turning CONTROL SRST bit on for at + least 5us. Both PATA and SATA support it but, in case of + SATA, this may require controller-specific support as the + second Register FIS to clear SRST should be transmitted + while BSY bit is still set. Note that on PATA, this resets + both master and slave devices on a channel. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>EXECUTE DEVICE DIAGNOSTIC command</term> + <listitem> + <para> + Although ATA/ATAPI standard doesn't describe exactly, EDD + implies some level of resetting, possibly similar level + with software reset. Host-side EDD protocol can be handled + with normal command processing and most SATA controllers + should be able to handle EDD's just like other commands. + As in software reset, EDD affects both devices on a PATA + bus. + </para> + <para> + Although EDD does reset devices, this doesn't suit error + handling as EDD cannot be issued while BSY is set and it's + unclear how it will act when device is in unknown/weird + state. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>ATAPI DEVICE RESET command</term> + <listitem> + <para> + This is very similar to software reset except that reset + can be restricted to the selected device without affecting + the other device sharing the cable. + </para> + </listitem> + </varlistentry> + + <varlistentry><term>SATA phy reset</term> + <listitem> + <para> + This is the preferred way of resetting a SATA device. In + effect, it's identical to PATA hardware reset. Note that + this can be done with the standard SCR Control register. + As such, it's usually easier to implement than software + reset. + </para> + </listitem> + </varlistentry> + + </variablelist> + + <para> + One more thing to consider when resetting devices is that + resetting clears certain configuration parameters and they + need to be set to their previous or newly adjusted values + after reset. + </para> + + <para> + Parameters affected are. + </para> + + <itemizedlist> + + <listitem> + <para> + CHS set up with INITIALIZE DEVICE PARAMETERS (seldomly used) + </para> + </listitem> + + <listitem> + <para> + Parameters set with SET FEATURES including transfer mode setting + </para> + </listitem> + + <listitem> + <para> + Block count set with SET MULTIPLE MODE + </para> + </listitem> + + <listitem> + <para> + Other parameters (SET MAX, MEDIA LOCK...) + </para> + </listitem> + + </itemizedlist> + + <para> + ATA/ATAPI standard specifies that some parameters must be + maintained across hardware or software reset, but doesn't + strictly specify all of them. Always reconfiguring needed + parameters after reset is required for robustness. Note that + this also applies when resuming from deep sleep (power-off). + </para> + + <para> + Also, ATA/ATAPI standard requires that IDENTIFY DEVICE / + IDENTIFY PACKET DEVICE is issued after any configuration + parameter is updated or a hardware reset and the result used + for further operation. OS driver is required to implement + revalidation mechanism to support this. + </para> + + </sect2> + + <sect2 id="exrecReconf"> + <title>Reconfigure transport</title> + + <para> + For both PATA and SATA, a lot of corners are cut for cheap + connectors, cables or controllers and it's quite common to see + high transmission error rate. This can be mitigated by + lowering transmission speed. + </para> + + <para> + The following is a possible scheme Jeff Garzik suggested. + </para> + + <blockquote> + <para> + If more than $N (3?) transmission errors happen in 15 minutes, + </para> + <itemizedlist> + <listitem> + <para> + if SATA, decrease SATA PHY speed. if speed cannot be decreased, + </para> + </listitem> + <listitem> + <para> + decrease UDMA xfer speed. if at UDMA0, switch to PIO4, + </para> + </listitem> + <listitem> + <para> + decrease PIO xfer speed. if at PIO3, complain, but continue + </para> + </listitem> + </itemizedlist> + </blockquote> + + </sect2> + + </sect1> + + </chapter> + + <chapter id="PiixInt"> + <title>ata_piix Internals</title> +!Idrivers/ata/ata_piix.c + </chapter> + + <chapter id="SILInt"> + <title>sata_sil Internals</title> +!Idrivers/ata/sata_sil.c + </chapter> + + <chapter id="libataThanks"> + <title>Thanks</title> + <para> + The bulk of the ATA knowledge comes thanks to long conversations with + Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA + and SCSI specifications. + </para> + <para> + Thanks to Alan Cox for pointing out similarities + between SATA and SCSI, and in general for motivation to hack on + libata. + </para> + <para> + libata's device detection + method, ata_pio_devchk, and in general all the early probing was + based on extensive study of Hale Landis's probe/reset code in his + ATADRVR driver (www.ata-atapi.com). + </para> + </chapter> + +</book> diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl new file mode 100644 index 0000000..94f2136 --- /dev/null +++ b/Documentation/DocBook/librs.tmpl @@ -0,0 +1,289 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="Reed-Solomon-Library-Guide"> + <bookinfo> + <title>Reed-Solomon Library Programming Interface</title> + + <authorgroup> + <author> + <firstname>Thomas</firstname> + <surname>Gleixner</surname> + <affiliation> + <address> + <email>tglx@linutronix.de</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2004</year> + <holder>Thomas Gleixner</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + The generic Reed-Solomon Library provides encoding, decoding + and error correction functions. + </para> + <para> + Reed-Solomon codes are used in communication and storage + applications to ensure data integrity. + </para> + <para> + This documentation is provided for developers who want to utilize + the functions provided by the library. + </para> + </chapter> + + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + None. + </para> + </chapter> + + <chapter id="usage"> + <title>Usage</title> + <para> + This chapter provides examples of how to use the library. + </para> + <sect1> + <title>Initializing</title> + <para> + The init function init_rs returns a pointer to an + rs decoder structure, which holds the necessary + information for encoding, decoding and error correction + with the given polynomial. It either uses an existing + matching decoder or creates a new one. On creation all + the lookup tables for fast en/decoding are created. + The function may take a while, so make sure not to + call it in critical code paths. + </para> + <programlisting> +/* the Reed Solomon control structure */ +static struct rs_control *rs_decoder; + +/* Symbolsize is 10 (bits) + * Primitive polynomial is x^10+x^3+1 + * first consecutive root is 0 + * primitive element to generate roots = 1 + * generator polynomial degree (number of roots) = 6 + */ +rs_decoder = init_rs (10, 0x409, 0, 1, 6); + </programlisting> + </sect1> + <sect1> + <title>Encoding</title> + <para> + The encoder calculates the Reed-Solomon code over + the given data length and stores the result in + the parity buffer. Note that the parity buffer must + be initialized before calling the encoder. + </para> + <para> + The expanded data can be inverted on the fly by + providing a non-zero inversion mask. The expanded data is + XOR'ed with the mask. This is used e.g. for FLASH + ECC, where the all 0xFF is inverted to an all 0x00. + The Reed-Solomon code for all 0x00 is all 0x00. The + code is inverted before storing to FLASH so it is 0xFF + too. This prevents that reading from an erased FLASH + results in ECC errors. + </para> + <para> + The databytes are expanded to the given symbol size + on the fly. There is no support for encoding continuous + bitstreams with a symbol size != 8 at the moment. If + it is necessary it should be not a big deal to implement + such functionality. + </para> + <programlisting> +/* Parity buffer. Size = number of roots */ +uint16_t par[6]; +/* Initialize the parity buffer */ +memset(par, 0, sizeof(par)); +/* Encode 512 byte in data8. Store parity in buffer par */ +encode_rs8 (rs_decoder, data8, 512, par, 0); + </programlisting> + </sect1> + <sect1> + <title>Decoding</title> + <para> + The decoder calculates the syndrome over + the given data length and the received parity symbols + and corrects errors in the data. + </para> + <para> + If a syndrome is available from a hardware decoder + then the syndrome calculation is skipped. + </para> + <para> + The correction of the data buffer can be suppressed + by providing a correction pattern buffer and an error + location buffer to the decoder. The decoder stores the + calculated error location and the correction bitmask + in the given buffers. This is useful for hardware + decoders which use a weird bit ordering scheme. + </para> + <para> + The databytes are expanded to the given symbol size + on the fly. There is no support for decoding continuous + bitstreams with a symbolsize != 8 at the moment. If + it is necessary it should be not a big deal to implement + such functionality. + </para> + + <sect2> + <title> + Decoding with syndrome calculation, direct data correction + </title> + <programlisting> +/* Parity buffer. Size = number of roots */ +uint16_t par[6]; +uint8_t data[512]; +int numerr; +/* Receive data */ +..... +/* Receive parity */ +..... +/* Decode 512 byte in data8.*/ +numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL); + </programlisting> + </sect2> + + <sect2> + <title> + Decoding with syndrome given by hardware decoder, direct data correction + </title> + <programlisting> +/* Parity buffer. Size = number of roots */ +uint16_t par[6], syn[6]; +uint8_t data[512]; +int numerr; +/* Receive data */ +..... +/* Receive parity */ +..... +/* Get syndrome from hardware decoder */ +..... +/* Decode 512 byte in data8.*/ +numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL); + </programlisting> + </sect2> + + <sect2> + <title> + Decoding with syndrome given by hardware decoder, no direct data correction. + </title> + <para> + Note: It's not necessary to give data and received parity to the decoder. + </para> + <programlisting> +/* Parity buffer. Size = number of roots */ +uint16_t par[6], syn[6], corr[8]; +uint8_t data[512]; +int numerr, errpos[8]; +/* Receive data */ +..... +/* Receive parity */ +..... +/* Get syndrome from hardware decoder */ +..... +/* Decode 512 byte in data8.*/ +numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr); +for (i = 0; i < numerr; i++) { + do_error_correction_in_your_buffer(errpos[i], corr[i]); +} + </programlisting> + </sect2> + </sect1> + <sect1> + <title>Cleanup</title> + <para> + The function free_rs frees the allocated resources, + if the caller is the last user of the decoder. + </para> + <programlisting> +/* Release resources */ +free_rs(rs_decoder); + </programlisting> + </sect1> + + </chapter> + + <chapter id="structs"> + <title>Structures</title> + <para> + This chapter contains the autogenerated documentation of the structures which are + used in the Reed-Solomon Library and are relevant for a developer. + </para> +!Iinclude/linux/rslib.h + </chapter> + + <chapter id="pubfunctions"> + <title>Public Functions Provided</title> + <para> + This chapter contains the autogenerated documentation of the Reed-Solomon functions + which are exported. + </para> +!Elib/reed_solomon/reed_solomon.c + </chapter> + + <chapter id="credits"> + <title>Credits</title> + <para> + The library code for encoding and decoding was written by Phil Karn. + </para> + <programlisting> + Copyright 2002, Phil Karn, KA9Q + May be used under the terms of the GNU General Public License (GPL) + </programlisting> + <para> + The wrapper functions and interfaces are written by Thomas Gleixner. + </para> + <para> + Many users have provided bugfixes, improvements and helping hands for testing. + Thanks a lot. + </para> + <para> + The following people have contributed to this document: + </para> + <para> + Thomas Gleixner<email>tglx@linutronix.de</email> + </para> + </chapter> +</book> diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl new file mode 100644 index 0000000..fe7664c --- /dev/null +++ b/Documentation/DocBook/lsm.tmpl @@ -0,0 +1,265 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<article class="whitepaper" id="LinuxSecurityModule" lang="en"> + <articleinfo> + <title>Linux Security Modules: General Security Hooks for Linux</title> + <authorgroup> + <author> + <firstname>Stephen</firstname> + <surname>Smalley</surname> + <affiliation> + <orgname>NAI Labs</orgname> + <address><email>ssmalley@nai.com</email></address> + </affiliation> + </author> + <author> + <firstname>Timothy</firstname> + <surname>Fraser</surname> + <affiliation> + <orgname>NAI Labs</orgname> + <address><email>tfraser@nai.com</email></address> + </affiliation> + </author> + <author> + <firstname>Chris</firstname> + <surname>Vance</surname> + <affiliation> + <orgname>NAI Labs</orgname> + <address><email>cvance@nai.com</email></address> + </affiliation> + </author> + </authorgroup> + </articleinfo> + +<sect1 id="Introduction"><title>Introduction</title> + +<para> +In March 2001, the National Security Agency (NSA) gave a presentation +about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel +Summit. SELinux is an implementation of flexible and fine-grained +nondiscretionary access controls in the Linux kernel, originally +implemented as its own particular kernel patch. Several other +security projects (e.g. RSBAC, Medusa) have also developed flexible +access control architectures for the Linux kernel, and various +projects have developed particular access control models for Linux +(e.g. LIDS, DTE, SubDomain). Each project has developed and +maintained its own kernel patch to support its security needs. +</para> + +<para> +In response to the NSA presentation, Linus Torvalds made a set of +remarks that described a security framework he would be willing to +consider for inclusion in the mainstream Linux kernel. He described a +general framework that would provide a set of security hooks to +control operations on kernel objects and a set of opaque security +fields in kernel data structures for maintaining security attributes. +This framework could then be used by loadable kernel modules to +implement any desired model of security. Linus also suggested the +possibility of migrating the Linux capabilities code into such a +module. +</para> + +<para> +The Linux Security Modules (LSM) project was started by WireX to +develop such a framework. LSM is a joint development effort by +several security projects, including Immunix, SELinux, SGI and Janus, +and several individuals, including Greg Kroah-Hartman and James +Morris, to develop a Linux kernel patch that implements this +framework. The patch is currently tracking the 2.4 series and is +targeted for integration into the 2.5 development series. This +technical report provides an overview of the framework and the example +capabilities security module provided by the LSM kernel patch. +</para> + +</sect1> + +<sect1 id="framework"><title>LSM Framework</title> + +<para> +The LSM kernel patch provides a general kernel framework to support +security modules. In particular, the LSM framework is primarily +focused on supporting access control modules, although future +development is likely to address other security needs such as +auditing. By itself, the framework does not provide any additional +security; it merely provides the infrastructure to support security +modules. The LSM kernel patch also moves most of the capabilities +logic into an optional security module, with the system defaulting +to the traditional superuser logic. This capabilities module +is discussed further in <xref linkend="cap"/>. +</para> + +<para> +The LSM kernel patch adds security fields to kernel data structures +and inserts calls to hook functions at critical points in the kernel +code to manage the security fields and to perform access control. It +also adds functions for registering and unregistering security +modules, and adds a general <function>security</function> system call +to support new system calls for security-aware applications. +</para> + +<para> +The LSM security fields are simply <type>void*</type> pointers. For +process and program execution security information, security fields +were added to <structname>struct task_struct</structname> and +<structname>struct linux_binprm</structname>. For filesystem security +information, a security field was added to +<structname>struct super_block</structname>. For pipe, file, and socket +security information, security fields were added to +<structname>struct inode</structname> and +<structname>struct file</structname>. For packet and network device security +information, security fields were added to +<structname>struct sk_buff</structname> and +<structname>struct net_device</structname>. For System V IPC security +information, security fields were added to +<structname>struct kern_ipc_perm</structname> and +<structname>struct msg_msg</structname>; additionally, the definitions +for <structname>struct msg_msg</structname>, <structname>struct +msg_queue</structname>, and <structname>struct +shmid_kernel</structname> were moved to header files +(<filename>include/linux/msg.h</filename> and +<filename>include/linux/shm.h</filename> as appropriate) to allow +the security modules to use these definitions. +</para> + +<para> +Each LSM hook is a function pointer in a global table, +security_ops. This table is a +<structname>security_operations</structname> structure as defined by +<filename>include/linux/security.h</filename>. Detailed documentation +for each hook is included in this header file. At present, this +structure consists of a collection of substructures that group related +hooks based on the kernel object (e.g. task, inode, file, sk_buff, +etc) as well as some top-level hook function pointers for system +operations. This structure is likely to be flattened in the future +for performance. The placement of the hook calls in the kernel code +is described by the "called:" lines in the per-hook documentation in +the header file. The hook calls can also be easily found in the +kernel code by looking for the string "security_ops->". + +</para> + +<para> +Linus mentioned per-process security hooks in his original remarks as a +possible alternative to global security hooks. However, if LSM were +to start from the perspective of per-process hooks, then the base +framework would have to deal with how to handle operations that +involve multiple processes (e.g. kill), since each process might have +its own hook for controlling the operation. This would require a +general mechanism for composing hooks in the base framework. +Additionally, LSM would still need global hooks for operations that +have no process context (e.g. network input operations). +Consequently, LSM provides global security hooks, but a security +module is free to implement per-process hooks (where that makes sense) +by storing a security_ops table in each process' security field and +then invoking these per-process hooks from the global hooks. +The problem of composition is thus deferred to the module. +</para> + +<para> +The global security_ops table is initialized to a set of hook +functions provided by a dummy security module that provides +traditional superuser logic. A <function>register_security</function> +function (in <filename>security/security.c</filename>) is provided to +allow a security module to set security_ops to refer to its own hook +functions, and an <function>unregister_security</function> function is +provided to revert security_ops to the dummy module hooks. This +mechanism is used to set the primary security module, which is +responsible for making the final decision for each hook. +</para> + +<para> +LSM also provides a simple mechanism for stacking additional security +modules with the primary security module. It defines +<function>register_security</function> and +<function>unregister_security</function> hooks in the +<structname>security_operations</structname> structure and provides +<function>mod_reg_security</function> and +<function>mod_unreg_security</function> functions that invoke these +hooks after performing some sanity checking. A security module can +call these functions in order to stack with other modules. However, +the actual details of how this stacking is handled are deferred to the +module, which can implement these hooks in any way it wishes +(including always returning an error if it does not wish to support +stacking). In this manner, LSM again defers the problem of +composition to the module. +</para> + +<para> +Although the LSM hooks are organized into substructures based on +kernel object, all of the hooks can be viewed as falling into two +major categories: hooks that are used to manage the security fields +and hooks that are used to perform access control. Examples of the +first category of hooks include the +<function>alloc_security</function> and +<function>free_security</function> hooks defined for each kernel data +structure that has a security field. These hooks are used to allocate +and free security structures for kernel objects. The first category +of hooks also includes hooks that set information in the security +field after allocation, such as the <function>post_lookup</function> +hook in <structname>struct inode_security_ops</structname>. This hook +is used to set security information for inodes after successful lookup +operations. An example of the second category of hooks is the +<function>permission</function> hook in +<structname>struct inode_security_ops</structname>. This hook checks +permission when accessing an inode. +</para> + +</sect1> + +<sect1 id="cap"><title>LSM Capabilities Module</title> + +<para> +The LSM kernel patch moves most of the existing POSIX.1e capabilities +logic into an optional security module stored in the file +<filename>security/capability.c</filename>. This change allows +users who do not want to use capabilities to omit this code entirely +from their kernel, instead using the dummy module for traditional +superuser logic or any other module that they desire. This change +also allows the developers of the capabilities logic to maintain and +enhance their code more freely, without needing to integrate patches +back into the base kernel. +</para> + +<para> +In addition to moving the capabilities logic, the LSM kernel patch +could move the capability-related fields from the kernel data +structures into the new security fields managed by the security +modules. However, at present, the LSM kernel patch leaves the +capability fields in the kernel data structures. In his original +remarks, Linus suggested that this might be preferable so that other +security modules can be easily stacked with the capabilities module +without needing to chain multiple security structures on the security field. +It also avoids imposing extra overhead on the capabilities module +to manage the security fields. However, the LSM framework could +certainly support such a move if it is determined to be desirable, +with only a few additional changes described below. +</para> + +<para> +At present, the capabilities logic for computing process capabilities +on <function>execve</function> and <function>set*uid</function>, +checking capabilities for a particular process, saving and checking +capabilities for netlink messages, and handling the +<function>capget</function> and <function>capset</function> system +calls have been moved into the capabilities module. There are still a +few locations in the base kernel where capability-related fields are +directly examined or modified, but the current version of the LSM +patch does allow a security module to completely replace the +assignment and testing of capabilities. These few locations would +need to be changed if the capability-related fields were moved into +the security field. The following is a list of known locations that +still perform such direct examination or modification of +capability-related fields: +<itemizedlist> +<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem> +<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem> +<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem> +<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem> +</itemizedlist> +</para> + +</sect1> + +</article> diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl new file mode 100644 index 0000000..77c3c20 --- /dev/null +++ b/Documentation/DocBook/mac80211.tmpl @@ -0,0 +1,331 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="mac80211-developers-guide"> + <bookinfo> + <title>The mac80211 subsystem for kernel developers</title> + + <authorgroup> + <author> + <firstname>Johannes</firstname> + <surname>Berg</surname> + <affiliation> + <address><email>johannes@sipsolutions.net</email></address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2007</year> + <year>2008</year> + <holder>Johannes Berg</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This documentation is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this documentation; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + + <abstract> +!Pinclude/net/mac80211.h Introduction +!Pinclude/net/mac80211.h Warning + </abstract> + </bookinfo> + + <toc></toc> + +<!-- +Generally, this document shall be ordered by increasing complexity. +It is important to note that readers should be able to read only +the first few sections to get a working driver and only advanced +usage should require reading the full document. +--> + + <part> + <title>The basic mac80211 driver interface</title> + <partintro> + <para> + You should read and understand the information contained + within this part of the book while implementing a driver. + In some chapters, advanced usage is noted, that may be + skipped at first. + </para> + <para> + This part of the book only covers station and monitor mode + functionality, additional information required to implement + the other modes is covered in the second part of the book. + </para> + </partintro> + + <chapter id="basics"> + <title>Basic hardware handling</title> + <para>TBD</para> + <para> + This chapter shall contain information on getting a hw + struct allocated and registered with mac80211. + </para> + <para> + Since it is required to allocate rates/modes before registering + a hw struct, this chapter shall also contain information on setting + up the rate/mode structs. + </para> + <para> + Additionally, some discussion about the callbacks and + the general programming model should be in here, including + the definition of ieee80211_ops which will be referred to + a lot. + </para> + <para> + Finally, a discussion of hardware capabilities should be done + with references to other parts of the book. + </para> +<!-- intentionally multiple !F lines to get proper order --> +!Finclude/net/mac80211.h ieee80211_hw +!Finclude/net/mac80211.h ieee80211_hw_flags +!Finclude/net/mac80211.h SET_IEEE80211_DEV +!Finclude/net/mac80211.h SET_IEEE80211_PERM_ADDR +!Finclude/net/mac80211.h ieee80211_ops +!Finclude/net/mac80211.h ieee80211_alloc_hw +!Finclude/net/mac80211.h ieee80211_register_hw +!Finclude/net/mac80211.h ieee80211_get_tx_led_name +!Finclude/net/mac80211.h ieee80211_get_rx_led_name +!Finclude/net/mac80211.h ieee80211_get_assoc_led_name +!Finclude/net/mac80211.h ieee80211_get_radio_led_name +!Finclude/net/mac80211.h ieee80211_unregister_hw +!Finclude/net/mac80211.h ieee80211_free_hw + </chapter> + + <chapter id="phy-handling"> + <title>PHY configuration</title> + <para>TBD</para> + <para> + This chapter should describe PHY handling including + start/stop callbacks and the various structures used. + </para> +!Finclude/net/mac80211.h ieee80211_conf +!Finclude/net/mac80211.h ieee80211_conf_flags + </chapter> + + <chapter id="iface-handling"> + <title>Virtual interfaces</title> + <para>TBD</para> + <para> + This chapter should describe virtual interface basics + that are relevant to the driver (VLANs, MGMT etc are not.) + It should explain the use of the add_iface/remove_iface + callbacks as well as the interface configuration callbacks. + </para> + <para>Things related to AP mode should be discussed there.</para> + <para> + Things related to supporting multiple interfaces should be + in the appropriate chapter, a BIG FAT note should be here about + this though and the recommendation to allow only a single + interface in STA mode at first! + </para> +!Finclude/net/mac80211.h ieee80211_if_init_conf +!Finclude/net/mac80211.h ieee80211_if_conf + </chapter> + + <chapter id="rx-tx"> + <title>Receive and transmit processing</title> + <sect1> + <title>what should be here</title> + <para>TBD</para> + <para> + This should describe the receive and transmit + paths in mac80211/the drivers as well as + transmit status handling. + </para> + </sect1> + <sect1> + <title>Frame format</title> +!Pinclude/net/mac80211.h Frame format + </sect1> + <sect1> + <title>Alignment issues</title> + <para>TBD</para> + </sect1> + <sect1> + <title>Calling into mac80211 from interrupts</title> +!Pinclude/net/mac80211.h Calling mac80211 from interrupts + </sect1> + <sect1> + <title>functions/definitions</title> +!Finclude/net/mac80211.h ieee80211_rx_status +!Finclude/net/mac80211.h mac80211_rx_flags +!Finclude/net/mac80211.h ieee80211_tx_info +!Finclude/net/mac80211.h ieee80211_rx +!Finclude/net/mac80211.h ieee80211_rx_irqsafe +!Finclude/net/mac80211.h ieee80211_tx_status +!Finclude/net/mac80211.h ieee80211_tx_status_irqsafe +!Finclude/net/mac80211.h ieee80211_rts_get +!Finclude/net/mac80211.h ieee80211_rts_duration +!Finclude/net/mac80211.h ieee80211_ctstoself_get +!Finclude/net/mac80211.h ieee80211_ctstoself_duration +!Finclude/net/mac80211.h ieee80211_generic_frame_duration +!Finclude/net/mac80211.h ieee80211_get_hdrlen_from_skb +!Finclude/net/mac80211.h ieee80211_hdrlen +!Finclude/net/mac80211.h ieee80211_wake_queue +!Finclude/net/mac80211.h ieee80211_stop_queue +!Finclude/net/mac80211.h ieee80211_wake_queues +!Finclude/net/mac80211.h ieee80211_stop_queues + </sect1> + </chapter> + + <chapter id="filters"> + <title>Frame filtering</title> +!Pinclude/net/mac80211.h Frame filtering +!Finclude/net/mac80211.h ieee80211_filter_flags + </chapter> + </part> + + <part id="advanced"> + <title>Advanced driver interface</title> + <partintro> + <para> + Information contained within this part of the book is + of interest only for advanced interaction of mac80211 + with drivers to exploit more hardware capabilities and + improve performance. + </para> + </partintro> + + <chapter id="hardware-crypto-offload"> + <title>Hardware crypto acceleration</title> +!Pinclude/net/mac80211.h Hardware crypto acceleration +<!-- intentionally multiple !F lines to get proper order --> +!Finclude/net/mac80211.h set_key_cmd +!Finclude/net/mac80211.h ieee80211_key_conf +!Finclude/net/mac80211.h ieee80211_key_alg +!Finclude/net/mac80211.h ieee80211_key_flags + </chapter> + + <chapter id="qos"> + <title>Multiple queues and QoS support</title> + <para>TBD</para> +!Finclude/net/mac80211.h ieee80211_tx_queue_params +!Finclude/net/mac80211.h ieee80211_tx_queue_stats + </chapter> + + <chapter id="AP"> + <title>Access point mode support</title> + <para>TBD</para> + <para>Some parts of the if_conf should be discussed here instead</para> + <para> + Insert notes about VLAN interfaces with hw crypto here or + in the hw crypto chapter. + </para> +!Finclude/net/mac80211.h ieee80211_get_buffered_bc +!Finclude/net/mac80211.h ieee80211_beacon_get + </chapter> + + <chapter id="multi-iface"> + <title>Supporting multiple virtual interfaces</title> + <para>TBD</para> + <para> + Note: WDS with identical MAC address should almost always be OK + </para> + <para> + Insert notes about having multiple virtual interfaces with + different MAC addresses here, note which configurations are + supported by mac80211, add notes about supporting hw crypto + with it. + </para> + </chapter> + + <chapter id="hardware-scan-offload"> + <title>Hardware scan offload</title> + <para>TBD</para> +!Finclude/net/mac80211.h ieee80211_scan_completed + </chapter> + </part> + + <part id="rate-control"> + <title>Rate control interface</title> + <partintro> + <para>TBD</para> + <para> + This part of the book describes the rate control algorithm + interface and how it relates to mac80211 and drivers. + </para> + </partintro> + <chapter id="dummy"> + <title>dummy chapter</title> + <para>TBD</para> + </chapter> + </part> + + <part id="internal"> + <title>Internals</title> + <partintro> + <para>TBD</para> + <para> + This part of the book describes mac80211 internals. + </para> + </partintro> + + <chapter id="key-handling"> + <title>Key handling</title> + <sect1> + <title>Key handling basics</title> +!Pnet/mac80211/key.c Key handling basics + </sect1> + <sect1> + <title>MORE TBD</title> + <para>TBD</para> + </sect1> + </chapter> + + <chapter id="rx-processing"> + <title>Receive processing</title> + <para>TBD</para> + </chapter> + + <chapter id="tx-processing"> + <title>Transmit processing</title> + <para>TBD</para> + </chapter> + + <chapter id="sta-info"> + <title>Station info handling</title> + <sect1> + <title>Programming information</title> +!Fnet/mac80211/sta_info.h sta_info +!Fnet/mac80211/sta_info.h ieee80211_sta_info_flags + </sect1> + <sect1> + <title>STA information lifetime rules</title> +!Pnet/mac80211/sta_info.c STA information lifetime rules + </sect1> + </chapter> + + <chapter id="synchronisation"> + <title>Synchronisation</title> + <para>TBD</para> + <para>Locking, lots of RCU</para> + </chapter> + </part> +</book> diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl new file mode 100644 index 0000000..467ccac --- /dev/null +++ b/Documentation/DocBook/mcabook.tmpl @@ -0,0 +1,107 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="MCAGuide"> + <bookinfo> + <title>MCA Driver Programming Interface</title> + + <authorgroup> + <author> + <firstname>Alan</firstname> + <surname>Cox</surname> + <affiliation> + <address> + <email>alan@lxorguk.ukuu.org.uk</email> + </address> + </affiliation> + </author> + <author> + <firstname>David</firstname> + <surname>Weinehall</surname> + </author> + <author> + <firstname>Chris</firstname> + <surname>Beauregard</surname> + </author> + </authorgroup> + + <copyright> + <year>2000</year> + <holder>Alan Cox</holder> + <holder>David Weinehall</holder> + <holder>Chris Beauregard</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + The MCA bus functions provide a generalised interface to find MCA + bus cards, to claim them for a driver, and to read and manipulate POS + registers without being aware of the motherboard internals or + certain deep magic specific to onboard devices. + </para> + <para> + The basic interface to the MCA bus devices is the slot. Each slot + is numbered and virtual slot numbers are assigned to the internal + devices. Using a pci_dev as other busses do does not really make + sense in the MCA context as the MCA bus resources require card + specific interpretation. + </para> + <para> + Finally the MCA bus functions provide a parallel set of DMA + functions mimicing the ISA bus DMA functions as closely as possible, + although also supporting the additional DMA functionality on the + MCA bus controllers. + </para> + </chapter> + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + None. + </para> + </chapter> + + <chapter id="pubfunctions"> + <title>Public Functions Provided</title> +!Edrivers/mca/mca-legacy.c + </chapter> + + <chapter id="dmafunctions"> + <title>DMA Functions Provided</title> +!Iarch/x86/include/asm/mca_dma.h + </chapter> + +</book> diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl new file mode 100644 index 0000000..8e14585 --- /dev/null +++ b/Documentation/DocBook/mtdnand.tmpl @@ -0,0 +1,1318 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="MTD-NAND-Guide"> + <bookinfo> + <title>MTD NAND Driver Programming Interface</title> + + <authorgroup> + <author> + <firstname>Thomas</firstname> + <surname>Gleixner</surname> + <affiliation> + <address> + <email>tglx@linutronix.de</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2004</year> + <holder>Thomas Gleixner</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + The generic NAND driver supports almost all NAND and AG-AND based + chips and connects them to the Memory Technology Devices (MTD) + subsystem of the Linux Kernel. + </para> + <para> + This documentation is provided for developers who want to implement + board drivers or filesystem drivers suitable for NAND devices. + </para> + </chapter> + + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + None. + </para> + </chapter> + + <chapter id="dochints"> + <title>Documentation hints</title> + <para> + The function and structure docs are autogenerated. Each function and + struct member has a short description which is marked with an [XXX] identifier. + The following chapters explain the meaning of those identifiers. + </para> + <sect1 id="Function_identifiers_XXX"> + <title>Function identifiers [XXX]</title> + <para> + The functions are marked with [XXX] identifiers in the short + comment. The identifiers explain the usage and scope of the + functions. Following identifiers are used: + </para> + <itemizedlist> + <listitem><para> + [MTD Interface]</para><para> + These functions provide the interface to the MTD kernel API. + They are not replacable and provide functionality + which is complete hardware independent. + </para></listitem> + <listitem><para> + [NAND Interface]</para><para> + These functions are exported and provide the interface to the NAND kernel API. + </para></listitem> + <listitem><para> + [GENERIC]</para><para> + Generic functions are not replacable and provide functionality + which is complete hardware independent. + </para></listitem> + <listitem><para> + [DEFAULT]</para><para> + Default functions provide hardware related functionality which is suitable + for most of the implementations. These functions can be replaced by the + board driver if neccecary. Those functions are called via pointers in the + NAND chip description structure. The board driver can set the functions which + should be replaced by board dependent functions before calling nand_scan(). + If the function pointer is NULL on entry to nand_scan() then the pointer + is set to the default function which is suitable for the detected chip type. + </para></listitem> + </itemizedlist> + </sect1> + <sect1 id="Struct_member_identifiers_XXX"> + <title>Struct member identifiers [XXX]</title> + <para> + The struct members are marked with [XXX] identifiers in the + comment. The identifiers explain the usage and scope of the + members. Following identifiers are used: + </para> + <itemizedlist> + <listitem><para> + [INTERN]</para><para> + These members are for NAND driver internal use only and must not be + modified. Most of these values are calculated from the chip geometry + information which is evaluated during nand_scan(). + </para></listitem> + <listitem><para> + [REPLACEABLE]</para><para> + Replaceable members hold hardware related functions which can be + provided by the board driver. The board driver can set the functions which + should be replaced by board dependent functions before calling nand_scan(). + If the function pointer is NULL on entry to nand_scan() then the pointer + is set to the default function which is suitable for the detected chip type. + </para></listitem> + <listitem><para> + [BOARDSPECIFIC]</para><para> + Board specific members hold hardware related information which must + be provided by the board driver. The board driver must set the function + pointers and datafields before calling nand_scan(). + </para></listitem> + <listitem><para> + [OPTIONAL]</para><para> + Optional members can hold information relevant for the board driver. The + generic NAND driver code does not use this information. + </para></listitem> + </itemizedlist> + </sect1> + </chapter> + + <chapter id="basicboarddriver"> + <title>Basic board driver</title> + <para> + For most boards it will be sufficient to provide just the + basic functions and fill out some really board dependent + members in the nand chip description structure. + </para> + <sect1 id="Basic_defines"> + <title>Basic defines</title> + <para> + At least you have to provide a mtd structure and + a storage for the ioremap'ed chip address. + You can allocate the mtd structure using kmalloc + or you can allocate it statically. + In case of static allocation you have to allocate + a nand_chip structure too. + </para> + <para> + Kmalloc based example + </para> + <programlisting> +static struct mtd_info *board_mtd; +static unsigned long baseaddr; + </programlisting> + <para> + Static example + </para> + <programlisting> +static struct mtd_info board_mtd; +static struct nand_chip board_chip; +static unsigned long baseaddr; + </programlisting> + </sect1> + <sect1 id="Partition_defines"> + <title>Partition defines</title> + <para> + If you want to divide your device into partitions, then + enable the configuration switch CONFIG_MTD_PARTITIONS and define + a partitioning scheme suitable to your board. + </para> + <programlisting> +#define NUM_PARTITIONS 2 +static struct mtd_partition partition_info[] = { + { .name = "Flash partition 1", + .offset = 0, + .size = 8 * 1024 * 1024 }, + { .name = "Flash partition 2", + .offset = MTDPART_OFS_NEXT, + .size = MTDPART_SIZ_FULL }, +}; + </programlisting> + </sect1> + <sect1 id="Hardware_control_functions"> + <title>Hardware control function</title> + <para> + The hardware control function provides access to the + control pins of the NAND chip(s). + The access can be done by GPIO pins or by address lines. + If you use address lines, make sure that the timing + requirements are met. + </para> + <para> + <emphasis>GPIO based example</emphasis> + </para> + <programlisting> +static void board_hwcontrol(struct mtd_info *mtd, int cmd) +{ + switch(cmd){ + case NAND_CTL_SETCLE: /* Set CLE pin high */ break; + case NAND_CTL_CLRCLE: /* Set CLE pin low */ break; + case NAND_CTL_SETALE: /* Set ALE pin high */ break; + case NAND_CTL_CLRALE: /* Set ALE pin low */ break; + case NAND_CTL_SETNCE: /* Set nCE pin low */ break; + case NAND_CTL_CLRNCE: /* Set nCE pin high */ break; + } +} + </programlisting> + <para> + <emphasis>Address lines based example.</emphasis> It's assumed that the + nCE pin is driven by a chip select decoder. + </para> + <programlisting> +static void board_hwcontrol(struct mtd_info *mtd, int cmd) +{ + struct nand_chip *this = (struct nand_chip *) mtd->priv; + switch(cmd){ + case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; + case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; + case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break; + case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break; + } +} + </programlisting> + </sect1> + <sect1 id="Device_ready_function"> + <title>Device ready function</title> + <para> + If the hardware interface has the ready busy pin of the NAND chip connected to a + GPIO or other accesible I/O pin, this function is used to read back the state of the + pin. The function has no arguments and should return 0, if the device is busy (R/B pin + is low) and 1, if the device is ready (R/B pin is high). + If the hardware interface does not give access to the ready busy pin, then + the function must not be defined and the function pointer this->dev_ready is set to NULL. + </para> + </sect1> + <sect1 id="Init_function"> + <title>Init function</title> + <para> + The init function allocates memory and sets up all the board + specific parameters and function pointers. When everything + is set up nand_scan() is called. This function tries to + detect and identify then chip. If a chip is found all the + internal data fields are initialized accordingly. + The structure(s) have to be zeroed out first and then filled with the neccecary + information about the device. + </para> + <programlisting> +int __init board_init (void) +{ + struct nand_chip *this; + int err = 0; + + /* Allocate memory for MTD device structure and private data */ + board_mtd = kzalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL); + if (!board_mtd) { + printk ("Unable to allocate NAND MTD device structure.\n"); + err = -ENOMEM; + goto out; + } + + /* map physical address */ + baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024); + if(!baseaddr){ + printk("Ioremap to access NAND chip failed\n"); + err = -EIO; + goto out_mtd; + } + + /* Get pointer to private data */ + this = (struct nand_chip *) (); + /* Link the private data with the MTD structure */ + board_mtd->priv = this; + + /* Set address of NAND IO lines */ + this->IO_ADDR_R = baseaddr; + this->IO_ADDR_W = baseaddr; + /* Reference hardware control function */ + this->hwcontrol = board_hwcontrol; + /* Set command delay time, see datasheet for correct value */ + this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; + /* Assign the device ready function, if available */ + this->dev_ready = board_dev_ready; + this->eccmode = NAND_ECC_SOFT; + + /* Scan to find existence of the device */ + if (nand_scan (board_mtd, 1)) { + err = -ENXIO; + goto out_ior; + } + + add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS); + goto out; + +out_ior: + iounmap((void *)baseaddr); +out_mtd: + kfree (board_mtd); +out: + return err; +} +module_init(board_init); + </programlisting> + </sect1> + <sect1 id="Exit_function"> + <title>Exit function</title> + <para> + The exit function is only neccecary if the driver is + compiled as a module. It releases all resources which + are held by the chip driver and unregisters the partitions + in the MTD layer. + </para> + <programlisting> +#ifdef MODULE +static void __exit board_cleanup (void) +{ + /* Release resources, unregister device */ + nand_release (board_mtd); + + /* unmap physical address */ + iounmap((void *)baseaddr); + + /* Free the MTD device structure */ + kfree (board_mtd); +} +module_exit(board_cleanup); +#endif + </programlisting> + </sect1> + </chapter> + + <chapter id="boarddriversadvanced"> + <title>Advanced board driver functions</title> + <para> + This chapter describes the advanced functionality of the NAND + driver. For a list of functions which can be overridden by the board + driver see the documentation of the nand_chip structure. + </para> + <sect1 id="Multiple_chip_control"> + <title>Multiple chip control</title> + <para> + The nand driver can control chip arrays. Therefor the + board driver must provide an own select_chip function. This + function must (de)select the requested chip. + The function pointer in the nand_chip structure must + be set before calling nand_scan(). The maxchip parameter + of nand_scan() defines the maximum number of chips to + scan for. Make sure that the select_chip function can + handle the requested number of chips. + </para> + <para> + The nand driver concatenates the chips to one virtual + chip and provides this virtual chip to the MTD layer. + </para> + <para> + <emphasis>Note: The driver can only handle linear chip arrays + of equally sized chips. There is no support for + parallel arrays which extend the buswidth.</emphasis> + </para> + <para> + <emphasis>GPIO based example</emphasis> + </para> + <programlisting> +static void board_select_chip (struct mtd_info *mtd, int chip) +{ + /* Deselect all chips, set all nCE pins high */ + GPIO(BOARD_NAND_NCE) |= 0xff; + if (chip >= 0) + GPIO(BOARD_NAND_NCE) &= ~ (1 << chip); +} + </programlisting> + <para> + <emphasis>Address lines based example.</emphasis> + Its assumed that the nCE pins are connected to an + address decoder. + </para> + <programlisting> +static void board_select_chip (struct mtd_info *mtd, int chip) +{ + struct nand_chip *this = (struct nand_chip *) mtd->priv; + + /* Deselect all chips */ + this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; + this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; + switch (chip) { + case 0: + this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; + this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; + break; + .... + case n: + this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; + this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; + break; + } +} + </programlisting> + </sect1> + <sect1 id="Hardware_ECC_support"> + <title>Hardware ECC support</title> + <sect2 id="Functions_and_constants"> + <title>Functions and constants</title> + <para> + The nand driver supports three different types of + hardware ECC. + <itemizedlist> + <listitem><para>NAND_ECC_HW3_256</para><para> + Hardware ECC generator providing 3 bytes ECC per + 256 byte. + </para> </listitem> + <listitem><para>NAND_ECC_HW3_512</para><para> + Hardware ECC generator providing 3 bytes ECC per + 512 byte. + </para> </listitem> + <listitem><para>NAND_ECC_HW6_512</para><para> + Hardware ECC generator providing 6 bytes ECC per + 512 byte. + </para> </listitem> + <listitem><para>NAND_ECC_HW8_512</para><para> + Hardware ECC generator providing 6 bytes ECC per + 512 byte. + </para> </listitem> + </itemizedlist> + If your hardware generator has a different functionality + add it at the appropriate place in nand_base.c + </para> + <para> + The board driver must provide following functions: + <itemizedlist> + <listitem><para>enable_hwecc</para><para> + This function is called before reading / writing to + the chip. Reset or initialize the hardware generator + in this function. The function is called with an + argument which let you distinguish between read + and write operations. + </para> </listitem> + <listitem><para>calculate_ecc</para><para> + This function is called after read / write from / to + the chip. Transfer the ECC from the hardware to + the buffer. If the option NAND_HWECC_SYNDROME is set + then the function is only called on write. See below. + </para> </listitem> + <listitem><para>correct_data</para><para> + In case of an ECC error this function is called for + error detection and correction. Return 1 respectively 2 + in case the error can be corrected. If the error is + not correctable return -1. If your hardware generator + matches the default algorithm of the nand_ecc software + generator then use the correction function provided + by nand_ecc instead of implementing duplicated code. + </para> </listitem> + </itemizedlist> + </para> + </sect2> + <sect2 id="Hardware_ECC_with_syndrome_calculation"> + <title>Hardware ECC with syndrome calculation</title> + <para> + Many hardware ECC implementations provide Reed-Solomon + codes and calculate an error syndrome on read. The syndrome + must be converted to a standard Reed-Solomon syndrome + before calling the error correction code in the generic + Reed-Solomon library. + </para> + <para> + The ECC bytes must be placed immidiately after the data + bytes in order to make the syndrome generator work. This + is contrary to the usual layout used by software ECC. The + seperation of data and out of band area is not longer + possible. The nand driver code handles this layout and + the remaining free bytes in the oob area are managed by + the autoplacement code. Provide a matching oob-layout + in this case. See rts_from4.c and diskonchip.c for + implementation reference. In those cases we must also + use bad block tables on FLASH, because the ECC layout is + interferring with the bad block marker positions. + See bad block table support for details. + </para> + </sect2> + </sect1> + <sect1 id="Bad_Block_table_support"> + <title>Bad block table support</title> + <para> + Most NAND chips mark the bad blocks at a defined + position in the spare area. Those blocks must + not be erased under any circumstances as the bad + block information would be lost. + It is possible to check the bad block mark each + time when the blocks are accessed by reading the + spare area of the first page in the block. This + is time consuming so a bad block table is used. + </para> + <para> + The nand driver supports various types of bad block + tables. + <itemizedlist> + <listitem><para>Per device</para><para> + The bad block table contains all bad block information + of the device which can consist of multiple chips. + </para> </listitem> + <listitem><para>Per chip</para><para> + A bad block table is used per chip and contains the + bad block information for this particular chip. + </para> </listitem> + <listitem><para>Fixed offset</para><para> + The bad block table is located at a fixed offset + in the chip (device). This applies to various + DiskOnChip devices. + </para> </listitem> + <listitem><para>Automatic placed</para><para> + The bad block table is automatically placed and + detected either at the end or at the beginning + of a chip (device) + </para> </listitem> + <listitem><para>Mirrored tables</para><para> + The bad block table is mirrored on the chip (device) to + allow updates of the bad block table without data loss. + </para> </listitem> + </itemizedlist> + </para> + <para> + nand_scan() calls the function nand_default_bbt(). + nand_default_bbt() selects appropriate default + bad block table desriptors depending on the chip information + which was retrieved by nand_scan(). + </para> + <para> + The standard policy is scanning the device for bad + blocks and build a ram based bad block table which + allows faster access than always checking the + bad block information on the flash chip itself. + </para> + <sect2 id="Flash_based_tables"> + <title>Flash based tables</title> + <para> + It may be desired or neccecary to keep a bad block table in FLASH. + For AG-AND chips this is mandatory, as they have no factory marked + bad blocks. They have factory marked good blocks. The marker pattern + is erased when the block is erased to be reused. So in case of + powerloss before writing the pattern back to the chip this block + would be lost and added to the bad blocks. Therefor we scan the + chip(s) when we detect them the first time for good blocks and + store this information in a bad block table before erasing any + of the blocks. + </para> + <para> + The blocks in which the tables are stored are procteted against + accidental access by marking them bad in the memory bad block + table. The bad block table managment functions are allowed + to circumvernt this protection. + </para> + <para> + The simplest way to activate the FLASH based bad block table support + is to set the option NAND_USE_FLASH_BBT in the option field of + the nand chip structure before calling nand_scan(). For AG-AND + chips is this done by default. + This activates the default FLASH based bad block table functionality + of the NAND driver. The default bad block table options are + <itemizedlist> + <listitem><para>Store bad block table per chip</para></listitem> + <listitem><para>Use 2 bits per block</para></listitem> + <listitem><para>Automatic placement at the end of the chip</para></listitem> + <listitem><para>Use mirrored tables with version numbers</para></listitem> + <listitem><para>Reserve 4 blocks at the end of the chip</para></listitem> + </itemizedlist> + </para> + </sect2> + <sect2 id="User_defined_tables"> + <title>User defined tables</title> + <para> + User defined tables are created by filling out a + nand_bbt_descr structure and storing the pointer in the + nand_chip structure member bbt_td before calling nand_scan(). + If a mirror table is neccecary a second structure must be + created and a pointer to this structure must be stored + in bbt_md inside the nand_chip structure. If the bbt_md + member is set to NULL then only the main table is used + and no scan for the mirrored table is performed. + </para> + <para> + The most important field in the nand_bbt_descr structure + is the options field. The options define most of the + table properties. Use the predefined constants from + nand.h to define the options. + <itemizedlist> + <listitem><para>Number of bits per block</para> + <para>The supported number of bits is 1, 2, 4, 8.</para></listitem> + <listitem><para>Table per chip</para> + <para>Setting the constant NAND_BBT_PERCHIP selects that + a bad block table is managed for each chip in a chip array. + If this option is not set then a per device bad block table + is used.</para></listitem> + <listitem><para>Table location is absolute</para> + <para>Use the option constant NAND_BBT_ABSPAGE and + define the absolute page number where the bad block + table starts in the field pages. If you have selected bad block + tables per chip and you have a multi chip array then the start page + must be given for each chip in the chip array. Note: there is no scan + for a table ident pattern performed, so the fields + pattern, veroffs, offs, len can be left uninitialized</para></listitem> + <listitem><para>Table location is automatically detected</para> + <para>The table can either be located in the first or the last good + blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place + the bad block table at the end of the chip (device). The + bad block tables are marked and identified by a pattern which + is stored in the spare area of the first page in the block which + holds the bad block table. Store a pointer to the pattern + in the pattern field. Further the length of the pattern has to be + stored in len and the offset in the spare area must be given + in the offs member of the nand_bbt_descr stucture. For mirrored + bad block tables different patterns are mandatory.</para></listitem> + <listitem><para>Table creation</para> + <para>Set the option NAND_BBT_CREATE to enable the table creation + if no table can be found during the scan. Usually this is done only + once if a new chip is found. </para></listitem> + <listitem><para>Table write support</para> + <para>Set the option NAND_BBT_WRITE to enable the table write support. + This allows the update of the bad block table(s) in case a block has + to be marked bad due to wear. The MTD interface function block_markbad + is calling the update function of the bad block table. If the write + support is enabled then the table is updated on FLASH.</para> + <para> + Note: Write support should only be enabled for mirrored tables with + version control. + </para></listitem> + <listitem><para>Table version control</para> + <para>Set the option NAND_BBT_VERSION to enable the table version control. + It's highly recommended to enable this for mirrored tables with write + support. It makes sure that the risk of loosing the bad block + table information is reduced to the loss of the information about the + one worn out block which should be marked bad. The version is stored in + 4 consecutive bytes in the spare area of the device. The position of + the version number is defined by the member veroffs in the bad block table + descriptor.</para></listitem> + <listitem><para>Save block contents on write</para> + <para> + In case that the block which holds the bad block table does contain + other useful information, set the option NAND_BBT_SAVECONTENT. When + the bad block table is written then the whole block is read the bad + block table is updated and the block is erased and everything is + written back. If this option is not set only the bad block table + is written and everything else in the block is ignored and erased. + </para></listitem> + <listitem><para>Number of reserved blocks</para> + <para> + For automatic placement some blocks must be reserved for + bad block table storage. The number of reserved blocks is defined + in the maxblocks member of the babd block table description structure. + Reserving 4 blocks for mirrored tables should be a reasonable number. + This also limits the number of blocks which are scanned for the bad + block table ident pattern. + </para></listitem> + </itemizedlist> + </para> + </sect2> + </sect1> + <sect1 id="Spare_area_placement"> + <title>Spare area (auto)placement</title> + <para> + The nand driver implements different possibilities for + placement of filesystem data in the spare area, + <itemizedlist> + <listitem><para>Placement defined by fs driver</para></listitem> + <listitem><para>Automatic placement</para></listitem> + </itemizedlist> + The default placement function is automatic placement. The + nand driver has built in default placement schemes for the + various chiptypes. If due to hardware ECC functionality the + default placement does not fit then the board driver can + provide a own placement scheme. + </para> + <para> + File system drivers can provide a own placement scheme which + is used instead of the default placement scheme. + </para> + <para> + Placement schemes are defined by a nand_oobinfo structure + <programlisting> +struct nand_oobinfo { + int useecc; + int eccbytes; + int eccpos[24]; + int oobfree[8][2]; +}; + </programlisting> + <itemizedlist> + <listitem><para>useecc</para><para> + The useecc member controls the ecc and placement function. The header + file include/mtd/mtd-abi.h contains constants to select ecc and + placement. MTD_NANDECC_OFF switches off the ecc complete. This is + not recommended and available for testing and diagnosis only. + MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE + selects automatic placement. + </para></listitem> + <listitem><para>eccbytes</para><para> + The eccbytes member defines the number of ecc bytes per page. + </para></listitem> + <listitem><para>eccpos</para><para> + The eccpos array holds the byte offsets in the spare area where + the ecc codes are placed. + </para></listitem> + <listitem><para>oobfree</para><para> + The oobfree array defines the areas in the spare area which can be + used for automatic placement. The information is given in the format + {offset, size}. offset defines the start of the usable area, size the + length in bytes. More than one area can be defined. The list is terminated + by an {0, 0} entry. + </para></listitem> + </itemizedlist> + </para> + <sect2 id="Placement_defined_by_fs_driver"> + <title>Placement defined by fs driver</title> + <para> + The calling function provides a pointer to a nand_oobinfo + structure which defines the ecc placement. For writes the + caller must provide a spare area buffer along with the + data buffer. The spare area buffer size is (number of pages) * + (size of spare area). For reads the buffer size is + (number of pages) * ((size of spare area) + (number of ecc + steps per page) * sizeof (int)). The driver stores the + result of the ecc check for each tuple in the spare buffer. + The storage sequence is + </para> + <para> + <spare data page 0><ecc result 0>...<ecc result n> + </para> + <para> + ... + </para> + <para> + <spare data page n><ecc result 0>...<ecc result n> + </para> + <para> + This is a legacy mode used by YAFFS1. + </para> + <para> + If the spare area buffer is NULL then only the ECC placement is + done according to the given scheme in the nand_oobinfo structure. + </para> + </sect2> + <sect2 id="Automatic_placement"> + <title>Automatic placement</title> + <para> + Automatic placement uses the built in defaults to place the + ecc bytes in the spare area. If filesystem data have to be stored / + read into the spare area then the calling function must provide a + buffer. The buffer size per page is determined by the oobfree array in + the nand_oobinfo structure. + </para> + <para> + If the spare area buffer is NULL then only the ECC placement is + done according to the default builtin scheme. + </para> + </sect2> + <sect2 id="User_space_placement_selection"> + <title>User space placement selection</title> + <para> + All non ecc functions like mtd->read and mtd->write use an internal + structure, which can be set by an ioctl. This structure is preset + to the autoplacement default. + <programlisting> + ioctl (fd, MEMSETOOBSEL, oobsel); + </programlisting> + oobsel is a pointer to a user supplied structure of type + nand_oobconfig. The contents of this structure must match the + criteria of the filesystem, which will be used. See an example in utils/nandwrite.c. + </para> + </sect2> + </sect1> + <sect1 id="Spare_area_autoplacement_default"> + <title>Spare area autoplacement default schemes</title> + <sect2 id="pagesize_256"> + <title>256 byte pagesize</title> +<informaltable><tgroup cols="3"><tbody> +<row> +<entry>Offset</entry> +<entry>Content</entry> +<entry>Comment</entry> +</row> +<row> +<entry>0x00</entry> +<entry>ECC byte 0</entry> +<entry>Error correction code byte 0</entry> +</row> +<row> +<entry>0x01</entry> +<entry>ECC byte 1</entry> +<entry>Error correction code byte 1</entry> +</row> +<row> +<entry>0x02</entry> +<entry>ECC byte 2</entry> +<entry>Error correction code byte 2</entry> +</row> +<row> +<entry>0x03</entry> +<entry>Autoplace 0</entry> +<entry></entry> +</row> +<row> +<entry>0x04</entry> +<entry>Autoplace 1</entry> +<entry></entry> +</row> +<row> +<entry>0x05</entry> +<entry>Bad block marker</entry> +<entry>If any bit in this byte is zero, then this block is bad. +This applies only to the first page in a block. In the remaining +pages this byte is reserved</entry> +</row> +<row> +<entry>0x06</entry> +<entry>Autoplace 2</entry> +<entry></entry> +</row> +<row> +<entry>0x07</entry> +<entry>Autoplace 3</entry> +<entry></entry> +</row> +</tbody></tgroup></informaltable> + </sect2> + <sect2 id="pagesize_512"> + <title>512 byte pagesize</title> +<informaltable><tgroup cols="3"><tbody> +<row> +<entry>Offset</entry> +<entry>Content</entry> +<entry>Comment</entry> +</row> +<row> +<entry>0x00</entry> +<entry>ECC byte 0</entry> +<entry>Error correction code byte 0 of the lower 256 Byte data in +this page</entry> +</row> +<row> +<entry>0x01</entry> +<entry>ECC byte 1</entry> +<entry>Error correction code byte 1 of the lower 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x02</entry> +<entry>ECC byte 2</entry> +<entry>Error correction code byte 2 of the lower 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x03</entry> +<entry>ECC byte 3</entry> +<entry>Error correction code byte 0 of the upper 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x04</entry> +<entry>reserved</entry> +<entry>reserved</entry> +</row> +<row> +<entry>0x05</entry> +<entry>Bad block marker</entry> +<entry>If any bit in this byte is zero, then this block is bad. +This applies only to the first page in a block. In the remaining +pages this byte is reserved</entry> +</row> +<row> +<entry>0x06</entry> +<entry>ECC byte 4</entry> +<entry>Error correction code byte 1 of the upper 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x07</entry> +<entry>ECC byte 5</entry> +<entry>Error correction code byte 2 of the upper 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x08 - 0x0F</entry> +<entry>Autoplace 0 - 7</entry> +<entry></entry> +</row> +</tbody></tgroup></informaltable> + </sect2> + <sect2 id="pagesize_2048"> + <title>2048 byte pagesize</title> +<informaltable><tgroup cols="3"><tbody> +<row> +<entry>Offset</entry> +<entry>Content</entry> +<entry>Comment</entry> +</row> +<row> +<entry>0x00</entry> +<entry>Bad block marker</entry> +<entry>If any bit in this byte is zero, then this block is bad. +This applies only to the first page in a block. In the remaining +pages this byte is reserved</entry> +</row> +<row> +<entry>0x01</entry> +<entry>Reserved</entry> +<entry>Reserved</entry> +</row> +<row> +<entry>0x02-0x27</entry> +<entry>Autoplace 0 - 37</entry> +<entry></entry> +</row> +<row> +<entry>0x28</entry> +<entry>ECC byte 0</entry> +<entry>Error correction code byte 0 of the first 256 Byte data in +this page</entry> +</row> +<row> +<entry>0x29</entry> +<entry>ECC byte 1</entry> +<entry>Error correction code byte 1 of the first 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x2A</entry> +<entry>ECC byte 2</entry> +<entry>Error correction code byte 2 of the first 256 Bytes data in +this page</entry> +</row> +<row> +<entry>0x2B</entry> +<entry>ECC byte 3</entry> +<entry>Error correction code byte 0 of the second 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x2C</entry> +<entry>ECC byte 4</entry> +<entry>Error correction code byte 1 of the second 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x2D</entry> +<entry>ECC byte 5</entry> +<entry>Error correction code byte 2 of the second 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x2E</entry> +<entry>ECC byte 6</entry> +<entry>Error correction code byte 0 of the third 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x2F</entry> +<entry>ECC byte 7</entry> +<entry>Error correction code byte 1 of the third 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x30</entry> +<entry>ECC byte 8</entry> +<entry>Error correction code byte 2 of the third 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x31</entry> +<entry>ECC byte 9</entry> +<entry>Error correction code byte 0 of the fourth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x32</entry> +<entry>ECC byte 10</entry> +<entry>Error correction code byte 1 of the fourth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x33</entry> +<entry>ECC byte 11</entry> +<entry>Error correction code byte 2 of the fourth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x34</entry> +<entry>ECC byte 12</entry> +<entry>Error correction code byte 0 of the fifth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x35</entry> +<entry>ECC byte 13</entry> +<entry>Error correction code byte 1 of the fifth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x36</entry> +<entry>ECC byte 14</entry> +<entry>Error correction code byte 2 of the fifth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x37</entry> +<entry>ECC byte 15</entry> +<entry>Error correction code byte 0 of the sixt 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x38</entry> +<entry>ECC byte 16</entry> +<entry>Error correction code byte 1 of the sixt 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x39</entry> +<entry>ECC byte 17</entry> +<entry>Error correction code byte 2 of the sixt 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x3A</entry> +<entry>ECC byte 18</entry> +<entry>Error correction code byte 0 of the seventh 256 Bytes of +data in this page</entry> +</row> +<row> +<entry>0x3B</entry> +<entry>ECC byte 19</entry> +<entry>Error correction code byte 1 of the seventh 256 Bytes of +data in this page</entry> +</row> +<row> +<entry>0x3C</entry> +<entry>ECC byte 20</entry> +<entry>Error correction code byte 2 of the seventh 256 Bytes of +data in this page</entry> +</row> +<row> +<entry>0x3D</entry> +<entry>ECC byte 21</entry> +<entry>Error correction code byte 0 of the eigth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x3E</entry> +<entry>ECC byte 22</entry> +<entry>Error correction code byte 1 of the eigth 256 Bytes of data +in this page</entry> +</row> +<row> +<entry>0x3F</entry> +<entry>ECC byte 23</entry> +<entry>Error correction code byte 2 of the eigth 256 Bytes of data +in this page</entry> +</row> +</tbody></tgroup></informaltable> + </sect2> + </sect1> + </chapter> + + <chapter id="filesystems"> + <title>Filesystem support</title> + <para> + The NAND driver provides all neccecary functions for a + filesystem via the MTD interface. + </para> + <para> + Filesystems must be aware of the NAND pecularities and + restrictions. One major restrictions of NAND Flash is, that you cannot + write as often as you want to a page. The consecutive writes to a page, + before erasing it again, are restricted to 1-3 writes, depending on the + manufacturers specifications. This applies similar to the spare area. + </para> + <para> + Therefor NAND aware filesystems must either write in page size chunks + or hold a writebuffer to collect smaller writes until they sum up to + pagesize. Available NAND aware filesystems: JFFS2, YAFFS. + </para> + <para> + The spare area usage to store filesystem data is controlled by + the spare area placement functionality which is described in one + of the earlier chapters. + </para> + </chapter> + <chapter id="tools"> + <title>Tools</title> + <para> + The MTD project provides a couple of helpful tools to handle NAND Flash. + <itemizedlist> + <listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem> + <listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem> + <listitem><para>nanddump: dump the contents of a NAND FLASH partitions</para></listitem> + </itemizedlist> + </para> + <para> + These tools are aware of the NAND restrictions. Please use those tools + instead of complaining about errors which are caused by non NAND aware + access methods. + </para> + </chapter> + + <chapter id="defines"> + <title>Constants</title> + <para> + This chapter describes the constants which might be relevant for a driver developer. + </para> + <sect1 id="Chip_option_constants"> + <title>Chip option constants</title> + <sect2 id="Constants_for_chip_id_table"> + <title>Constants for chip id table</title> + <para> + These constants are defined in nand.h. They are ored together to describe + the chip functionality. + <programlisting> +/* Chip can not auto increment pages */ +#define NAND_NO_AUTOINCR 0x00000001 +/* Buswitdh is 16 bit */ +#define NAND_BUSWIDTH_16 0x00000002 +/* Device supports partial programming without padding */ +#define NAND_NO_PADDING 0x00000004 +/* Chip has cache program function */ +#define NAND_CACHEPRG 0x00000008 +/* Chip has copy back function */ +#define NAND_COPYBACK 0x00000010 +/* AND Chip which has 4 banks and a confusing page / block + * assignment. See Renesas datasheet for further information */ +#define NAND_IS_AND 0x00000020 +/* Chip has a array of 4 pages which can be read without + * additional ready /busy waits */ +#define NAND_4PAGE_ARRAY 0x00000040 + </programlisting> + </para> + </sect2> + <sect2 id="Constants_for_runtime_options"> + <title>Constants for runtime options</title> + <para> + These constants are defined in nand.h. They are ored together to describe + the functionality. + <programlisting> +/* Use a flash based bad block table. This option is parsed by the + * default bad block table function (nand_default_bbt). */ +#define NAND_USE_FLASH_BBT 0x00010000 +/* The hw ecc generator provides a syndrome instead a ecc value on read + * This can only work if we have the ecc bytes directly behind the + * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */ +#define NAND_HWECC_SYNDROME 0x00020000 + </programlisting> + </para> + </sect2> + </sect1> + + <sect1 id="EEC_selection_constants"> + <title>ECC selection constants</title> + <para> + Use these constants to select the ECC algorithm. + <programlisting> +/* No ECC. Usage is not recommended ! */ +#define NAND_ECC_NONE 0 +/* Software ECC 3 byte ECC per 256 Byte data */ +#define NAND_ECC_SOFT 1 +/* Hardware ECC 3 byte ECC per 256 Byte data */ +#define NAND_ECC_HW3_256 2 +/* Hardware ECC 3 byte ECC per 512 Byte data */ +#define NAND_ECC_HW3_512 3 +/* Hardware ECC 6 byte ECC per 512 Byte data */ +#define NAND_ECC_HW6_512 4 +/* Hardware ECC 6 byte ECC per 512 Byte data */ +#define NAND_ECC_HW8_512 6 + </programlisting> + </para> + </sect1> + + <sect1 id="Hardware_control_related_constants"> + <title>Hardware control related constants</title> + <para> + These constants describe the requested hardware access function when + the boardspecific hardware control function is called + <programlisting> +/* Select the chip by setting nCE to low */ +#define NAND_CTL_SETNCE 1 +/* Deselect the chip by setting nCE to high */ +#define NAND_CTL_CLRNCE 2 +/* Select the command latch by setting CLE to high */ +#define NAND_CTL_SETCLE 3 +/* Deselect the command latch by setting CLE to low */ +#define NAND_CTL_CLRCLE 4 +/* Select the address latch by setting ALE to high */ +#define NAND_CTL_SETALE 5 +/* Deselect the address latch by setting ALE to low */ +#define NAND_CTL_CLRALE 6 +/* Set write protection by setting WP to high. Not used! */ +#define NAND_CTL_SETWP 7 +/* Clear write protection by setting WP to low. Not used! */ +#define NAND_CTL_CLRWP 8 + </programlisting> + </para> + </sect1> + + <sect1 id="Bad_block_table_constants"> + <title>Bad block table related constants</title> + <para> + These constants describe the options used for bad block + table descriptors. + <programlisting> +/* Options for the bad block table descriptors */ + +/* The number of bits used per block in the bbt on the device */ +#define NAND_BBT_NRBITS_MSK 0x0000000F +#define NAND_BBT_1BIT 0x00000001 +#define NAND_BBT_2BIT 0x00000002 +#define NAND_BBT_4BIT 0x00000004 +#define NAND_BBT_8BIT 0x00000008 +/* The bad block table is in the last good block of the device */ +#define NAND_BBT_LASTBLOCK 0x00000010 +/* The bbt is at the given page, else we must scan for the bbt */ +#define NAND_BBT_ABSPAGE 0x00000020 +/* The bbt is at the given page, else we must scan for the bbt */ +#define NAND_BBT_SEARCH 0x00000040 +/* bbt is stored per chip on multichip devices */ +#define NAND_BBT_PERCHIP 0x00000080 +/* bbt has a version counter at offset veroffs */ +#define NAND_BBT_VERSION 0x00000100 +/* Create a bbt if none axists */ +#define NAND_BBT_CREATE 0x00000200 +/* Search good / bad pattern through all pages of a block */ +#define NAND_BBT_SCANALLPAGES 0x00000400 +/* Scan block empty during good / bad block scan */ +#define NAND_BBT_SCANEMPTY 0x00000800 +/* Write bbt if neccecary */ +#define NAND_BBT_WRITE 0x00001000 +/* Read and write back block contents when writing bbt */ +#define NAND_BBT_SAVECONTENT 0x00002000 + </programlisting> + </para> + </sect1> + + </chapter> + + <chapter id="structs"> + <title>Structures</title> + <para> + This chapter contains the autogenerated documentation of the structures which are + used in the NAND driver and might be relevant for a driver developer. Each + struct member has a short description which is marked with an [XXX] identifier. + See the chapter "Documentation hints" for an explanation. + </para> +!Iinclude/linux/mtd/nand.h + </chapter> + + <chapter id="pubfunctions"> + <title>Public Functions Provided</title> + <para> + This chapter contains the autogenerated documentation of the NAND kernel API functions + which are exported. Each function has a short description which is marked with an [XXX] identifier. + See the chapter "Documentation hints" for an explanation. + </para> +!Edrivers/mtd/nand/nand_base.c +!Edrivers/mtd/nand/nand_bbt.c +!Edrivers/mtd/nand/nand_ecc.c + </chapter> + + <chapter id="intfunctions"> + <title>Internal Functions Provided</title> + <para> + This chapter contains the autogenerated documentation of the NAND driver internal functions. + Each function has a short description which is marked with an [XXX] identifier. + See the chapter "Documentation hints" for an explanation. + The functions marked with [DEFAULT] might be relevant for a board driver developer. + </para> +!Idrivers/mtd/nand/nand_base.c +!Idrivers/mtd/nand/nand_bbt.c +<!-- No internal functions for kernel-doc: +X!Idrivers/mtd/nand/nand_ecc.c +--> + </chapter> + + <chapter id="credits"> + <title>Credits</title> + <para> + The following people have contributed to the NAND driver: + <orderedlist> + <listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem> + <listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem> + <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem> + </orderedlist> + A lot of users have provided bugfixes, improvements and helping hands for testing. + Thanks a lot. + </para> + <para> + The following people have contributed to this document: + <orderedlist> + <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem> + </orderedlist> + </para> + </chapter> +</book> diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl new file mode 100644 index 0000000..f24f9e8 --- /dev/null +++ b/Documentation/DocBook/networking.tmpl @@ -0,0 +1,106 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="LinuxNetworking"> + <bookinfo> + <title>Linux Networking and Network Devices APIs</title> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="netcore"> + <title>Linux Networking</title> + <sect1><title>Networking Base Types</title> +!Iinclude/linux/net.h + </sect1> + <sect1><title>Socket Buffer Functions</title> +!Iinclude/linux/skbuff.h +!Iinclude/net/sock.h +!Enet/socket.c +!Enet/core/skbuff.c +!Enet/core/sock.c +!Enet/core/datagram.c +!Enet/core/stream.c + </sect1> + <sect1><title>Socket Filter</title> +!Enet/core/filter.c + </sect1> + <sect1><title>Generic Network Statistics</title> +!Iinclude/linux/gen_stats.h +!Enet/core/gen_stats.c +!Enet/core/gen_estimator.c + </sect1> + <sect1><title>SUN RPC subsystem</title> +<!-- The !D functionality is not perfect, garbage has to be protected by comments +!Dnet/sunrpc/sunrpc_syms.c +--> +!Enet/sunrpc/xdr.c +!Enet/sunrpc/svc_xprt.c +!Enet/sunrpc/xprt.c +!Enet/sunrpc/sched.c +!Enet/sunrpc/socklib.c +!Enet/sunrpc/stats.c +!Enet/sunrpc/rpc_pipe.c +!Enet/sunrpc/rpcb_clnt.c +!Enet/sunrpc/clnt.c + </sect1> + </chapter> + + <chapter id="netdev"> + <title>Network device support</title> + <sect1><title>Driver Support</title> +!Enet/core/dev.c +!Enet/ethernet/eth.c +!Enet/sched/sch_generic.c +!Iinclude/linux/etherdevice.h +!Iinclude/linux/netdevice.h + </sect1> + <sect1><title>PHY Support</title> +!Edrivers/net/phy/phy.c +!Idrivers/net/phy/phy.c +!Edrivers/net/phy/phy_device.c +!Idrivers/net/phy/phy_device.c +!Edrivers/net/phy/mdio_bus.c +!Idrivers/net/phy/mdio_bus.c + </sect1> +<!-- FIXME: Removed for now since no structured comments in source + <sect1><title>Wireless</title> +X!Enet/core/wireless.c + </sect1> +--> + <sect1><title>Synchronous PPP</title> +!Edrivers/net/wan/syncppp.c + </sect1> + </chapter> + +</book> diff --git a/Documentation/DocBook/procfs-guide.tmpl b/Documentation/DocBook/procfs-guide.tmpl new file mode 100644 index 0000000..9eba4b7 --- /dev/null +++ b/Documentation/DocBook/procfs-guide.tmpl @@ -0,0 +1,626 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [ +<!ENTITY procfsexample SYSTEM "procfs_example.xml"> +]> + +<book id="LKProcfsGuide"> + <bookinfo> + <title>Linux Kernel Procfs Guide</title> + + <authorgroup> + <author> + <firstname>Erik</firstname> + <othername>(J.A.K.)</othername> + <surname>Mouw</surname> + <affiliation> + <address> + <email>mouw@nl.linux.org</email> + </address> + </affiliation> + </author> + <othercredit> + <contrib> + This software and documentation were written while working on the + LART computing board + (<ulink url="http://www.lartmaker.nl/">http://www.lartmaker.nl/</ulink>), + which was sponsored by the Delt University of Technology projects + Mobile Multi-media Communications and Ubiquitous Communications. + </contrib> + </othercredit> + </authorgroup> + + <revhistory> + <revision> + <revnumber>1.0</revnumber> + <date>May 30, 2001</date> + <revremark>Initial revision posted to linux-kernel</revremark> + </revision> + <revision> + <revnumber>1.1</revnumber> + <date>June 3, 2001</date> + <revremark>Revised after comments from linux-kernel</revremark> + </revision> + </revhistory> + + <copyright> + <year>2001</year> + <holder>Erik Mouw</holder> + </copyright> + + + <legalnotice> + <para> + This documentation is free software; you can redistribute it + and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This documentation is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + PURPOSE. See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + + + + + <toc> + </toc> + + + + + <preface id="Preface"> + <title>Preface</title> + + <para> + This guide describes the use of the procfs file system from + within the Linux kernel. The idea to write this guide came up on + the #kernelnewbies IRC channel (see <ulink + url="http://www.kernelnewbies.org/">http://www.kernelnewbies.org/</ulink>), + when Jeff Garzik explained the use of procfs and forwarded me a + message Alexander Viro wrote to the linux-kernel mailing list. I + agreed to write it up nicely, so here it is. + </para> + + <para> + I'd like to thank Jeff Garzik + <email>jgarzik@pobox.com</email> and Alexander Viro + <email>viro@parcelfarce.linux.theplanet.co.uk</email> for their input, + Tim Waugh <email>twaugh@redhat.com</email> for his <ulink + url="http://people.redhat.com/twaugh/docbook/selfdocbook/">Selfdocbook</ulink>, + and Marc Joosen <email>marcj@historia.et.tudelft.nl</email> for + proofreading. + </para> + + <para> + Erik + </para> + </preface> + + + + + <chapter id="intro"> + <title>Introduction</title> + + <para> + The <filename class="directory">/proc</filename> file system + (procfs) is a special file system in the linux kernel. It's a + virtual file system: it is not associated with a block device + but exists only in memory. The files in the procfs are there to + allow userland programs access to certain information from the + kernel (like process information in <filename + class="directory">/proc/[0-9]+/</filename>), but also for debug + purposes (like <filename>/proc/ksyms</filename>). + </para> + + <para> + This guide describes the use of the procfs file system from + within the Linux kernel. It starts by introducing all relevant + functions to manage the files within the file system. After that + it shows how to communicate with userland, and some tips and + tricks will be pointed out. Finally a complete example will be + shown. + </para> + + <para> + Note that the files in <filename + class="directory">/proc/sys</filename> are sysctl files: they + don't belong to procfs and are governed by a completely + different API described in the Kernel API book. + </para> + </chapter> + + + + + <chapter id="managing"> + <title>Managing procfs entries</title> + + <para> + This chapter describes the functions that various kernel + components use to populate the procfs with files, symlinks, + device nodes, and directories. + </para> + + <para> + A minor note before we start: if you want to use any of the + procfs functions, be sure to include the correct header file! + This should be one of the first lines in your code: + </para> + + <programlisting> +#include <linux/proc_fs.h> + </programlisting> + + + + + <sect1 id="regularfile"> + <title>Creating a regular file</title> + + <funcsynopsis> + <funcprototype> + <funcdef>struct proc_dir_entry* <function>create_proc_entry</function></funcdef> + <paramdef>const char* <parameter>name</parameter></paramdef> + <paramdef>mode_t <parameter>mode</parameter></paramdef> + <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <para> + This function creates a regular file with the name + <parameter>name</parameter>, file mode + <parameter>mode</parameter> in the directory + <parameter>parent</parameter>. To create a file in the root of + the procfs, use <constant>NULL</constant> as + <parameter>parent</parameter> parameter. When successful, the + function will return a pointer to the freshly created + <structname>struct proc_dir_entry</structname>; otherwise it + will return <constant>NULL</constant>. <xref + linkend="userland"/> describes how to do something useful with + regular files. + </para> + + <para> + Note that it is specifically supported that you can pass a + path that spans multiple directories. For example + <function>create_proc_entry</function>(<parameter>"drivers/via0/info"</parameter>) + will create the <filename class="directory">via0</filename> + directory if necessary, with standard + <constant>0755</constant> permissions. + </para> + + <para> + If you only want to be able to read the file, the function + <function>create_proc_read_entry</function> described in <xref + linkend="convenience"/> may be used to create and initialise + the procfs entry in one single call. + </para> + </sect1> + + + + + <sect1 id="Creating_a_symlink"> + <title>Creating a symlink</title> + + <funcsynopsis> + <funcprototype> + <funcdef>struct proc_dir_entry* + <function>proc_symlink</function></funcdef> <paramdef>const + char* <parameter>name</parameter></paramdef> + <paramdef>struct proc_dir_entry* + <parameter>parent</parameter></paramdef> <paramdef>const + char* <parameter>dest</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <para> + This creates a symlink in the procfs directory + <parameter>parent</parameter> that points from + <parameter>name</parameter> to + <parameter>dest</parameter>. This translates in userland to + <literal>ln -s</literal> <parameter>dest</parameter> + <parameter>name</parameter>. + </para> + </sect1> + + <sect1 id="Creating_a_directory"> + <title>Creating a directory</title> + + <funcsynopsis> + <funcprototype> + <funcdef>struct proc_dir_entry* <function>proc_mkdir</function></funcdef> + <paramdef>const char* <parameter>name</parameter></paramdef> + <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <para> + Create a directory <parameter>name</parameter> in the procfs + directory <parameter>parent</parameter>. + </para> + </sect1> + + + + + <sect1 id="Removing_an_entry"> + <title>Removing an entry</title> + + <funcsynopsis> + <funcprototype> + <funcdef>void <function>remove_proc_entry</function></funcdef> + <paramdef>const char* <parameter>name</parameter></paramdef> + <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <para> + Removes the entry <parameter>name</parameter> in the directory + <parameter>parent</parameter> from the procfs. Entries are + removed by their <emphasis>name</emphasis>, not by the + <structname>struct proc_dir_entry</structname> returned by the + various create functions. Note that this function doesn't + recursively remove entries. + </para> + + <para> + Be sure to free the <structfield>data</structfield> entry from + the <structname>struct proc_dir_entry</structname> before + <function>remove_proc_entry</function> is called (that is: if + there was some <structfield>data</structfield> allocated, of + course). See <xref linkend="usingdata"/> for more information + on using the <structfield>data</structfield> entry. + </para> + </sect1> + </chapter> + + + + + <chapter id="userland"> + <title>Communicating with userland</title> + + <para> + Instead of reading (or writing) information directly from + kernel memory, procfs works with <emphasis>call back + functions</emphasis> for files: functions that are called when + a specific file is being read or written. Such functions have + to be initialised after the procfs file is created by setting + the <structfield>read_proc</structfield> and/or + <structfield>write_proc</structfield> fields in the + <structname>struct proc_dir_entry*</structname> that the + function <function>create_proc_entry</function> returned: + </para> + + <programlisting> +struct proc_dir_entry* entry; + +entry->read_proc = read_proc_foo; +entry->write_proc = write_proc_foo; + </programlisting> + + <para> + If you only want to use a the + <structfield>read_proc</structfield>, the function + <function>create_proc_read_entry</function> described in <xref + linkend="convenience"/> may be used to create and initialise the + procfs entry in one single call. + </para> + + + + <sect1 id="Reading_data"> + <title>Reading data</title> + + <para> + The read function is a call back function that allows userland + processes to read data from the kernel. The read function + should have the following format: + </para> + + <funcsynopsis> + <funcprototype> + <funcdef>int <function>read_func</function></funcdef> + <paramdef>char* <parameter>buffer</parameter></paramdef> + <paramdef>char** <parameter>start</parameter></paramdef> + <paramdef>off_t <parameter>off</parameter></paramdef> + <paramdef>int <parameter>count</parameter></paramdef> + <paramdef>int* <parameter>peof</parameter></paramdef> + <paramdef>void* <parameter>data</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <para> + The read function should write its information into the + <parameter>buffer</parameter>, which will be exactly + <literal>PAGE_SIZE</literal> bytes long. + </para> + + <para> + The parameter + <parameter>peof</parameter> should be used to signal that the + end of the file has been reached by writing + <literal>1</literal> to the memory location + <parameter>peof</parameter> points to. + </para> + + <para> + The <parameter>data</parameter> + parameter can be used to create a single call back function for + several files, see <xref linkend="usingdata"/>. + </para> + + <para> + The rest of the parameters and the return value are described + by a comment in <filename>fs/proc/generic.c</filename> as follows: + </para> + + <blockquote> + <para> + You have three ways to return data: + </para> + <orderedlist> + <listitem> + <para> + Leave <literal>*start = NULL</literal>. (This is the default.) + Put the data of the requested offset at that + offset within the buffer. Return the number (<literal>n</literal>) + of bytes there are from the beginning of the + buffer up to the last byte of data. If the + number of supplied bytes (<literal>= n - offset</literal>) is + greater than zero and you didn't signal eof + and the reader is prepared to take more data + you will be called again with the requested + offset advanced by the number of bytes + absorbed. This interface is useful for files + no larger than the buffer. + </para> + </listitem> + <listitem> + <para> + Set <literal>*start</literal> to an unsigned long value less than + the buffer address but greater than zero. + Put the data of the requested offset at the + beginning of the buffer. Return the number of + bytes of data placed there. If this number is + greater than zero and you didn't signal eof + and the reader is prepared to take more data + you will be called again with the requested + offset advanced by <literal>*start</literal>. This interface is + useful when you have a large file consisting + of a series of blocks which you want to count + and return as wholes. + (Hack by Paul.Russell@rustcorp.com.au) + </para> + </listitem> + <listitem> + <para> + Set <literal>*start</literal> to an address within the buffer. + Put the data of the requested offset at <literal>*start</literal>. + Return the number of bytes of data placed there. + If this number is greater than zero and you + didn't signal eof and the reader is prepared to + take more data you will be called again with the + requested offset advanced by the number of bytes + absorbed. + </para> + </listitem> + </orderedlist> + </blockquote> + + <para> + <xref linkend="example"/> shows how to use a read call back + function. + </para> + </sect1> + + + + + <sect1 id="Writing_data"> + <title>Writing data</title> + + <para> + The write call back function allows a userland process to write + data to the kernel, so it has some kind of control over the + kernel. The write function should have the following format: + </para> + + <funcsynopsis> + <funcprototype> + <funcdef>int <function>write_func</function></funcdef> + <paramdef>struct file* <parameter>file</parameter></paramdef> + <paramdef>const char* <parameter>buffer</parameter></paramdef> + <paramdef>unsigned long <parameter>count</parameter></paramdef> + <paramdef>void* <parameter>data</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <para> + The write function should read <parameter>count</parameter> + bytes at maximum from the <parameter>buffer</parameter>. Note + that the <parameter>buffer</parameter> doesn't live in the + kernel's memory space, so it should first be copied to kernel + space with <function>copy_from_user</function>. The + <parameter>file</parameter> parameter is usually + ignored. <xref linkend="usingdata"/> shows how to use the + <parameter>data</parameter> parameter. + </para> + + <para> + Again, <xref linkend="example"/> shows how to use this call back + function. + </para> + </sect1> + + + + + <sect1 id="usingdata"> + <title>A single call back for many files</title> + + <para> + When a large number of almost identical files is used, it's + quite inconvenient to use a separate call back function for + each file. A better approach is to have a single call back + function that distinguishes between the files by using the + <structfield>data</structfield> field in <structname>struct + proc_dir_entry</structname>. First of all, the + <structfield>data</structfield> field has to be initialised: + </para> + + <programlisting> +struct proc_dir_entry* entry; +struct my_file_data *file_data; + +file_data = kmalloc(sizeof(struct my_file_data), GFP_KERNEL); +entry->data = file_data; + </programlisting> + + <para> + The <structfield>data</structfield> field is a <type>void + *</type>, so it can be initialised with anything. + </para> + + <para> + Now that the <structfield>data</structfield> field is set, the + <function>read_proc</function> and + <function>write_proc</function> can use it to distinguish + between files because they get it passed into their + <parameter>data</parameter> parameter: + </para> + + <programlisting> +int foo_read_func(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + if(data == file_data) { + /* special case for this file */ + } else { + /* normal processing */ + } + + return len; +} + </programlisting> + + <para> + Be sure to free the <structfield>data</structfield> data field + when removing the procfs entry. + </para> + </sect1> + </chapter> + + + + + <chapter id="tips"> + <title>Tips and tricks</title> + + + + + <sect1 id="convenience"> + <title>Convenience functions</title> + + <funcsynopsis> + <funcprototype> + <funcdef>struct proc_dir_entry* <function>create_proc_read_entry</function></funcdef> + <paramdef>const char* <parameter>name</parameter></paramdef> + <paramdef>mode_t <parameter>mode</parameter></paramdef> + <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef> + <paramdef>read_proc_t* <parameter>read_proc</parameter></paramdef> + <paramdef>void* <parameter>data</parameter></paramdef> + </funcprototype> + </funcsynopsis> + + <para> + This function creates a regular file in exactly the same way + as <function>create_proc_entry</function> from <xref + linkend="regularfile"/> does, but also allows to set the read + function <parameter>read_proc</parameter> in one call. This + function can set the <parameter>data</parameter> as well, like + explained in <xref linkend="usingdata"/>. + </para> + </sect1> + + + + <sect1 id="Modules"> + <title>Modules</title> + + <para> + If procfs is being used from within a module, be sure to set + the <structfield>owner</structfield> field in the + <structname>struct proc_dir_entry</structname> to + <constant>THIS_MODULE</constant>. + </para> + + <programlisting> +struct proc_dir_entry* entry; + +entry->owner = THIS_MODULE; + </programlisting> + </sect1> + + + + + <sect1 id="Mode_and_ownership"> + <title>Mode and ownership</title> + + <para> + Sometimes it is useful to change the mode and/or ownership of + a procfs entry. Here is an example that shows how to achieve + that: + </para> + + <programlisting> +struct proc_dir_entry* entry; + +entry->mode = S_IWUSR |S_IRUSR | S_IRGRP | S_IROTH; +entry->uid = 0; +entry->gid = 100; + </programlisting> + + </sect1> + </chapter> + + + + + <chapter id="example"> + <title>Example</title> + + <!-- be careful with the example code: it shouldn't be wider than + approx. 60 columns, or otherwise it won't fit properly on a page + --> + +&procfsexample; + + </chapter> +</book> diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c new file mode 100644 index 0000000..8c6396e --- /dev/null +++ b/Documentation/DocBook/procfs_example.c @@ -0,0 +1,210 @@ +/* + * procfs_example.c: an example proc interface + * + * Copyright (C) 2001, Erik Mouw (mouw@nl.linux.org) + * + * This file accompanies the procfs-guide in the Linux kernel + * source. Its main use is to demonstrate the concepts and + * functions described in the guide. + * + * This software has been developed while working on the LART + * computing board (http://www.lartmaker.nl), which was sponsored + * by the Delt University of Technology projects Mobile Multi-media + * Communications and Ubiquitous Communications. + * + * This program is free software; you can redistribute + * it and/or modify it under the terms of the GNU General + * Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, + * Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/jiffies.h> +#include <asm/uaccess.h> + + +#define MODULE_VERS "1.0" +#define MODULE_NAME "procfs_example" + +#define FOOBAR_LEN 8 + +struct fb_data_t { + char name[FOOBAR_LEN + 1]; + char value[FOOBAR_LEN + 1]; +}; + + +static struct proc_dir_entry *example_dir, *foo_file, + *bar_file, *jiffies_file, *symlink; + + +struct fb_data_t foo_data, bar_data; + + +static int proc_read_jiffies(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = sprintf(page, "jiffies = %ld\n", + jiffies); + + return len; +} + + +static int proc_read_foobar(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + struct fb_data_t *fb_data = (struct fb_data_t *)data; + + /* DON'T DO THAT - buffer overruns are bad */ + len = sprintf(page, "%s = '%s'\n", + fb_data->name, fb_data->value); + + return len; +} + + +static int proc_write_foobar(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len; + struct fb_data_t *fb_data = (struct fb_data_t *)data; + + if(count > FOOBAR_LEN) + len = FOOBAR_LEN; + else + len = count; + + if(copy_from_user(fb_data->value, buffer, len)) + return -EFAULT; + + fb_data->value[len] = '\0'; + + return len; +} + + +static int __init init_procfs_example(void) +{ + int rv = 0; + + /* create directory */ + example_dir = proc_mkdir(MODULE_NAME, NULL); + if(example_dir == NULL) { + rv = -ENOMEM; + goto out; + } + + example_dir->owner = THIS_MODULE; + + /* create jiffies using convenience function */ + jiffies_file = create_proc_read_entry("jiffies", + 0444, example_dir, + proc_read_jiffies, + NULL); + if(jiffies_file == NULL) { + rv = -ENOMEM; + goto no_jiffies; + } + + jiffies_file->owner = THIS_MODULE; + + /* create foo and bar files using same callback + * functions + */ + foo_file = create_proc_entry("foo", 0644, example_dir); + if(foo_file == NULL) { + rv = -ENOMEM; + goto no_foo; + } + + strcpy(foo_data.name, "foo"); + strcpy(foo_data.value, "foo"); + foo_file->data = &foo_data; + foo_file->read_proc = proc_read_foobar; + foo_file->write_proc = proc_write_foobar; + foo_file->owner = THIS_MODULE; + + bar_file = create_proc_entry("bar", 0644, example_dir); + if(bar_file == NULL) { + rv = -ENOMEM; + goto no_bar; + } + + strcpy(bar_data.name, "bar"); + strcpy(bar_data.value, "bar"); + bar_file->data = &bar_data; + bar_file->read_proc = proc_read_foobar; + bar_file->write_proc = proc_write_foobar; + bar_file->owner = THIS_MODULE; + + /* create symlink */ + symlink = proc_symlink("jiffies_too", example_dir, + "jiffies"); + if(symlink == NULL) { + rv = -ENOMEM; + goto no_symlink; + } + + symlink->owner = THIS_MODULE; + + /* everything OK */ + printk(KERN_INFO "%s %s initialised\n", + MODULE_NAME, MODULE_VERS); + return 0; + +no_symlink: + remove_proc_entry("bar", example_dir); +no_bar: + remove_proc_entry("foo", example_dir); +no_foo: + remove_proc_entry("jiffies", example_dir); +no_jiffies: + remove_proc_entry(MODULE_NAME, NULL); +out: + return rv; +} + + +static void __exit cleanup_procfs_example(void) +{ + remove_proc_entry("jiffies_too", example_dir); + remove_proc_entry("bar", example_dir); + remove_proc_entry("foo", example_dir); + remove_proc_entry("jiffies", example_dir); + remove_proc_entry(MODULE_NAME, NULL); + + printk(KERN_INFO "%s %s removed\n", + MODULE_NAME, MODULE_VERS); +} + + +module_init(init_procfs_example); +module_exit(cleanup_procfs_example); + +MODULE_AUTHOR("Erik Mouw"); +MODULE_DESCRIPTION("procfs examples"); +MODULE_LICENSE("GPL"); diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl new file mode 100644 index 0000000..54eb26b --- /dev/null +++ b/Documentation/DocBook/rapidio.tmpl @@ -0,0 +1,159 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [ + <!ENTITY rapidio SYSTEM "rapidio.xml"> + ]> + +<book id="RapidIO-Guide"> + <bookinfo> + <title>RapidIO Subsystem Guide</title> + + <authorgroup> + <author> + <firstname>Matt</firstname> + <surname>Porter</surname> + <affiliation> + <address> + <email>mporter@kernel.crashing.org</email> + <email>mporter@mvista.com</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2005</year> + <holder>MontaVista Software, Inc.</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + RapidIO is a high speed switched fabric interconnect with + features aimed at the embedded market. RapidIO provides + support for memory-mapped I/O as well as message-based + transactions over the switched fabric network. RapidIO has + a standardized discovery mechanism not unlike the PCI bus + standard that allows simple detection of devices in a + network. + </para> + <para> + This documentation is provided for developers intending + to support RapidIO on new architectures, write new drivers, + or to understand the subsystem internals. + </para> + </chapter> + + <chapter id="bugs"> + <title>Known Bugs and Limitations</title> + + <sect1 id="known_bugs"> + <title>Bugs</title> + <para>None. ;)</para> + </sect1> + <sect1 id="Limitations"> + <title>Limitations</title> + <para> + <orderedlist> + <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem> + <listitem><para>Multiple host enumeration is not supported</para></listitem> + </orderedlist> + </para> + </sect1> + </chapter> + + <chapter id="drivers"> + <title>RapidIO driver interface</title> + <para> + Drivers are provided a set of calls in order + to interface with the subsystem to gather info + on devices, request/map memory region resources, + and manage mailboxes/doorbells. + </para> + <sect1 id="Functions"> + <title>Functions</title> +!Iinclude/linux/rio_drv.h +!Edrivers/rapidio/rio-driver.c +!Edrivers/rapidio/rio.c + </sect1> + </chapter> + + <chapter id="internals"> + <title>Internals</title> + + <para> + This chapter contains the autogenerated documentation of the RapidIO + subsystem. + </para> + + <sect1 id="Structures"><title>Structures</title> +!Iinclude/linux/rio.h + </sect1> + <sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title> +!Idrivers/rapidio/rio-scan.c + </sect1> + <sect1 id="Driver_functionality"><title>Driver functionality</title> +!Idrivers/rapidio/rio.c +!Idrivers/rapidio/rio-access.c + </sect1> + <sect1 id="Device_model_support"><title>Device model support</title> +!Idrivers/rapidio/rio-driver.c + </sect1> + <sect1 id="Sysfs_support"><title>Sysfs support</title> +!Idrivers/rapidio/rio-sysfs.c + </sect1> + <sect1 id="PPC32_support"><title>PPC32 support</title> +!Earch/powerpc/sysdev/fsl_rio.c +!Iarch/powerpc/sysdev/fsl_rio.c + </sect1> + </chapter> + + <chapter id="credits"> + <title>Credits</title> + <para> + The following people have contributed to the RapidIO + subsystem directly or indirectly: + <orderedlist> + <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> + <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem> + <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem> + </orderedlist> + </para> + <para> + The following people have contributed to this document: + <orderedlist> + <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem> + </orderedlist> + </para> + </chapter> +</book> diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl new file mode 100644 index 0000000..95bfc12 --- /dev/null +++ b/Documentation/DocBook/s390-drivers.tmpl @@ -0,0 +1,161 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="s390drivers"> + <bookinfo> + <title>Writing s390 channel device drivers</title> + + <authorgroup> + <author> + <firstname>Cornelia</firstname> + <surname>Huck</surname> + <affiliation> + <address> + <email>cornelia.huck@de.ibm.com</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2007</year> + <holder>IBM Corp.</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + This document describes the interfaces available for device drivers that + drive s390 based channel attached I/O devices. This includes interfaces for + interaction with the hardware and interfaces for interacting with the + common driver core. Those interfaces are provided by the s390 common I/O + layer. + </para> + <para> + The document assumes a familarity with the technical terms associated + with the s390 channel I/O architecture. For a description of this + architecture, please refer to the "z/Architecture: Principles of + Operation", IBM publication no. SA22-7832. + </para> + <para> + While most I/O devices on a s390 system are typically driven through the + channel I/O mechanism described here, there are various other methods + (like the diag interface). These are out of the scope of this document. + </para> + <para> + Some additional information can also be found in the kernel source + under Documentation/s390/driver-model.txt. + </para> + </chapter> + <chapter id="ccw"> + <title>The ccw bus</title> + <para> + The ccw bus typically contains the majority of devices available to + a s390 system. Named after the channel command word (ccw), the basic + command structure used to address its devices, the ccw bus contains + so-called channel attached devices. They are addressed via I/O + subchannels, visible on the css bus. A device driver for + channel-attached devices, however, will never interact with the + subchannel directly, but only via the I/O device on the ccw bus, + the ccw device. + </para> + <sect1 id="channelIO"> + <title>I/O functions for channel-attached devices</title> + <para> + Some hardware structures have been translated into C structures for use + by the common I/O layer and device drivers. For more information on + the hardware structures represented here, please consult the Principles + of Operation. + </para> +!Iarch/s390/include/asm/cio.h + </sect1> + <sect1 id="ccwdev"> + <title>ccw devices</title> + <para> + Devices that want to initiate channel I/O need to attach to the ccw bus. + Interaction with the driver core is done via the common I/O layer, which + provides the abstractions of ccw devices and ccw device drivers. + </para> + <para> + The functions that initiate or terminate channel I/O all act upon a + ccw device structure. Device drivers must not bypass those functions + or strange side effects may happen. + </para> +!Iarch/s390/include/asm/ccwdev.h +!Edrivers/s390/cio/device.c +!Edrivers/s390/cio/device_ops.c + </sect1> + <sect1 id="cmf"> + <title>The channel-measurement facility</title> + <para> + The channel-measurement facility provides a means to collect + measurement data which is made available by the channel subsystem + for each channel attached device. + </para> +!Iarch/s390/include/asm/cmb.h +!Edrivers/s390/cio/cmf.c + </sect1> + </chapter> + + <chapter id="ccwgroup"> + <title>The ccwgroup bus</title> + <para> + The ccwgroup bus only contains artificial devices, created by the user. + Many networking devices (e.g. qeth) are in fact composed of several + ccw devices (like read, write and data channel for qeth). The + ccwgroup bus provides a mechanism to create a meta-device which + contains those ccw devices as slave devices and can be associated + with the netdevice. + </para> + <sect1 id="ccwgroupdevices"> + <title>ccw group devices</title> +!Iarch/s390/include/asm/ccwgroup.h +!Edrivers/s390/cio/ccwgroup.c + </sect1> + </chapter> + + <chapter id="genericinterfaces"> + <title>Generic interfaces</title> + <para> + Some interfaces are available to other drivers that do not necessarily + have anything to do with the busses described above, but still are + indirectly using basic infrastructure in the common I/O layer. + One example is the support for adapter interrupts. + </para> +!Edrivers/s390/cio/airq.c + </chapter> + +</book> diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl new file mode 100644 index 0000000..10a150a --- /dev/null +++ b/Documentation/DocBook/scsi.tmpl @@ -0,0 +1,409 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="scsimid"> + <bookinfo> + <title>SCSI Interfaces Guide</title> + + <authorgroup> + <author> + <firstname>James</firstname> + <surname>Bottomley</surname> + <affiliation> + <address> + <email>James.Bottomley@hansenpartnership.com</email> + </address> + </affiliation> + </author> + + <author> + <firstname>Rob</firstname> + <surname>Landley</surname> + <affiliation> + <address> + <email>rob@landley.net</email> + </address> + </affiliation> + </author> + + </authorgroup> + + <copyright> + <year>2007</year> + <holder>Linux Foundation</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + + <toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <sect1 id="protocol_vs_bus"> + <title>Protocol vs bus</title> + <para> + Once upon a time, the Small Computer Systems Interface defined both + a parallel I/O bus and a data protocol to connect a wide variety of + peripherals (disk drives, tape drives, modems, printers, scanners, + optical drives, test equipment, and medical devices) to a host + computer. + </para> + <para> + Although the old parallel (fast/wide/ultra) SCSI bus has largely + fallen out of use, the SCSI command set is more widely used than ever + to communicate with devices over a number of different busses. + </para> + <para> + The <ulink url='http://www.t10.org/scsi-3.htm'>SCSI protocol</ulink> + is a big-endian peer-to-peer packet based protocol. SCSI commands + are 6, 10, 12, or 16 bytes long, often followed by an associated data + payload. + </para> + <para> + SCSI commands can be transported over just about any kind of bus, and + are the default protocol for storage devices attached to USB, SATA, + SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are + also commonly exchanged over Infiniband, + <ulink url='http://i2o.shadowconnect.com/faq.php'>I20</ulink>, TCP/IP + (<ulink url='http://en.wikipedia.org/wiki/ISCSI'>iSCSI</ulink>), even + <ulink url='http://cyberelk.net/tim/parport/parscsi.html'>Parallel + ports</ulink>. + </para> + </sect1> + <sect1 id="subsystem_design"> + <title>Design of the Linux SCSI subsystem</title> + <para> + The SCSI subsystem uses a three layer design, with upper, mid, and low + layers. Every operation involving the SCSI subsystem (such as reading + a sector from a disk) uses one driver at each of the 3 levels: one + upper layer driver, one lower layer driver, and the SCSI midlayer. + </para> + <para> + The SCSI upper layer provides the interface between userspace and the + kernel, in the form of block and char device nodes for I/O and + ioctl(). The SCSI lower layer contains drivers for specific hardware + devices. + </para> + <para> + In between is the SCSI mid-layer, analogous to a network routing + layer such as the IPv4 stack. The SCSI mid-layer routes a packet + based data protocol between the upper layer's /dev nodes and the + corresponding devices in the lower layer. It manages command queues, + provides error handling and power management functions, and responds + to ioctl() requests. + </para> + </sect1> + </chapter> + + <chapter id="upper_layer"> + <title>SCSI upper layer</title> + <para> + The upper layer supports the user-kernel interface by providing + device nodes. + </para> + <sect1 id="sd"> + <title>sd (SCSI Disk)</title> + <para>sd (sd_mod.o)</para> +<!-- !Idrivers/scsi/sd.c --> + </sect1> + <sect1 id="sr"> + <title>sr (SCSI CD-ROM)</title> + <para>sr (sr_mod.o)</para> + </sect1> + <sect1 id="st"> + <title>st (SCSI Tape)</title> + <para>st (st.o)</para> + </sect1> + <sect1 id="sg"> + <title>sg (SCSI Generic)</title> + <para>sg (sg.o)</para> + </sect1> + <sect1 id="ch"> + <title>ch (SCSI Media Changer)</title> + <para>ch (ch.c)</para> + </sect1> + </chapter> + + <chapter id="mid_layer"> + <title>SCSI mid layer</title> + + <sect1 id="midlayer_implementation"> + <title>SCSI midlayer implementation</title> + <sect2 id="scsi_device.h"> + <title>include/scsi/scsi_device.h</title> + <para> + </para> +!Iinclude/scsi/scsi_device.h + </sect2> + + <sect2 id="scsi.c"> + <title>drivers/scsi/scsi.c</title> + <para>Main file for the SCSI midlayer.</para> +!Edrivers/scsi/scsi.c + </sect2> + <sect2 id="scsicam.c"> + <title>drivers/scsi/scsicam.c</title> + <para> + <ulink url='http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf'>SCSI + Common Access Method</ulink> support functions, for use with + HDIO_GETGEO, etc. + </para> +!Edrivers/scsi/scsicam.c + </sect2> + <sect2 id="scsi_error.c"> + <title>drivers/scsi/scsi_error.c</title> + <para>Common SCSI error/timeout handling routines.</para> +!Edrivers/scsi/scsi_error.c + </sect2> + <sect2 id="scsi_devinfo.c"> + <title>drivers/scsi/scsi_devinfo.c</title> + <para> + Manage scsi_dev_info_list, which tracks blacklisted and whitelisted + devices. + </para> +!Idrivers/scsi/scsi_devinfo.c + </sect2> + <sect2 id="scsi_ioctl.c"> + <title>drivers/scsi/scsi_ioctl.c</title> + <para> + Handle ioctl() calls for SCSI devices. + </para> +!Edrivers/scsi/scsi_ioctl.c + </sect2> + <sect2 id="scsi_lib.c"> + <title>drivers/scsi/scsi_lib.c</title> + <para> + SCSI queuing library. + </para> +!Edrivers/scsi/scsi_lib.c + </sect2> + <sect2 id="scsi_lib_dma.c"> + <title>drivers/scsi/scsi_lib_dma.c</title> + <para> + SCSI library functions depending on DMA + (map and unmap scatter-gather lists). + </para> +!Edrivers/scsi/scsi_lib_dma.c + </sect2> + <sect2 id="scsi_module.c"> + <title>drivers/scsi/scsi_module.c</title> + <para> + The file drivers/scsi/scsi_module.c contains legacy support for + old-style host templates. It should never be used by any new driver. + </para> + </sect2> + <sect2 id="scsi_proc.c"> + <title>drivers/scsi/scsi_proc.c</title> + <para> + The functions in this file provide an interface between + the PROC file system and the SCSI device drivers + It is mainly used for debugging, statistics and to pass + information directly to the lowlevel driver. + + I.E. plumbing to manage /proc/scsi/* + </para> +!Idrivers/scsi/scsi_proc.c + </sect2> + <sect2 id="scsi_netlink.c"> + <title>drivers/scsi/scsi_netlink.c</title> + <para> + Infrastructure to provide async events from transports to userspace + via netlink, using a single NETLINK_SCSITRANSPORT protocol for all + transports. + + See <ulink url='http://marc.info/?l=linux-scsi&m=115507374832500&w=2'>the + original patch submission</ulink> for more details. + </para> +!Idrivers/scsi/scsi_netlink.c + </sect2> + <sect2 id="scsi_scan.c"> + <title>drivers/scsi/scsi_scan.c</title> + <para> + Scan a host to determine which (if any) devices are attached. + + The general scanning/probing algorithm is as follows, exceptions are + made to it depending on device specific flags, compilation options, + and global variable (boot or module load time) settings. + + A specific LUN is scanned via an INQUIRY command; if the LUN has a + device attached, a scsi_device is allocated and setup for it. + + For every id of every channel on the given host, start by scanning + LUN 0. Skip hosts that don't respond at all to a scan of LUN 0. + Otherwise, if LUN 0 has a device attached, allocate and setup a + scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN, + and scan all of the LUNs returned by the REPORT LUN; else, + sequentially scan LUNs up until some maximum is reached, or a LUN is + seen that cannot have a device attached to it. + </para> +!Idrivers/scsi/scsi_scan.c + </sect2> + <sect2 id="scsi_sysctl.c"> + <title>drivers/scsi/scsi_sysctl.c</title> + <para> + Set up the sysctl entry: "/dev/scsi/logging_level" + (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level. + </para> + </sect2> + <sect2 id="scsi_sysfs.c"> + <title>drivers/scsi/scsi_sysfs.c</title> + <para> + SCSI sysfs interface routines. + </para> +!Edrivers/scsi/scsi_sysfs.c + </sect2> + <sect2 id="hosts.c"> + <title>drivers/scsi/hosts.c</title> + <para> + mid to lowlevel SCSI driver interface + </para> +!Edrivers/scsi/hosts.c + </sect2> + <sect2 id="constants.c"> + <title>drivers/scsi/constants.c</title> + <para> + mid to lowlevel SCSI driver interface + </para> +!Edrivers/scsi/constants.c + </sect2> + </sect1> + + <sect1 id="Transport_classes"> + <title>Transport classes</title> + <para> + Transport classes are service libraries for drivers in the SCSI + lower layer, which expose transport attributes in sysfs. + </para> + <sect2 id="Fibre_Channel_transport"> + <title>Fibre Channel transport</title> + <para> + The file drivers/scsi/scsi_transport_fc.c defines transport attributes + for Fibre Channel. + </para> +!Edrivers/scsi/scsi_transport_fc.c + </sect2> + <sect2 id="iSCSI_transport"> + <title>iSCSI transport class</title> + <para> + The file drivers/scsi/scsi_transport_iscsi.c defines transport + attributes for the iSCSI class, which sends SCSI packets over TCP/IP + connections. + </para> +!Edrivers/scsi/scsi_transport_iscsi.c + </sect2> + <sect2 id="SAS_transport"> + <title>Serial Attached SCSI (SAS) transport class</title> + <para> + The file drivers/scsi/scsi_transport_sas.c defines transport + attributes for Serial Attached SCSI, a variant of SATA aimed at + large high-end systems. + </para> + <para> + The SAS transport class contains common code to deal with SAS HBAs, + an aproximated representation of SAS topologies in the driver model, + and various sysfs attributes to expose these topologies and managment + interfaces to userspace. + </para> + <para> + In addition to the basic SCSI core objects this transport class + introduces two additional intermediate objects: The SAS PHY + as represented by struct sas_phy defines an "outgoing" PHY on + a SAS HBA or Expander, and the SAS remote PHY represented by + struct sas_rphy defines an "incoming" PHY on a SAS Expander or + end device. Note that this is purely a software concept, the + underlying hardware for a PHY and a remote PHY is the exactly + the same. + </para> + <para> + There is no concept of a SAS port in this code, users can see + what PHYs form a wide port based on the port_identifier attribute, + which is the same for all PHYs in a port. + </para> +!Edrivers/scsi/scsi_transport_sas.c + </sect2> + <sect2 id="SATA_transport"> + <title>SATA transport class</title> + <para> + The SATA transport is handled by libata, which has its own book of + documentation in this directory. + </para> + </sect2> + <sect2 id="SPI_transport"> + <title>Parallel SCSI (SPI) transport class</title> + <para> + The file drivers/scsi/scsi_transport_spi.c defines transport + attributes for traditional (fast/wide/ultra) SCSI busses. + </para> +!Edrivers/scsi/scsi_transport_spi.c + </sect2> + <sect2 id="SRP_transport"> + <title>SCSI RDMA (SRP) transport class</title> + <para> + The file drivers/scsi/scsi_transport_srp.c defines transport + attributes for SCSI over Remote Direct Memory Access. + </para> +!Edrivers/scsi/scsi_transport_srp.c + </sect2> + </sect1> + + </chapter> + + <chapter id="lower_layer"> + <title>SCSI lower layer</title> + <sect1 id="hba_drivers"> + <title>Host Bus Adapter transport types</title> + <para> + Many modern device controllers use the SCSI command set as a protocol to + communicate with their devices through many different types of physical + connections. + </para> + <para> + In SCSI language a bus capable of carrying SCSI commands is + called a "transport", and a controller connecting to such a bus is + called a "host bus adapter" (HBA). + </para> + <sect2 id="scsi_debug.c"> + <title>Debug transport</title> + <para> + The file drivers/scsi/scsi_debug.c simulates a host adapter with a + variable number of disks (or disk like devices) attached, sharing a + common amount of RAM. Does a lot of checking to make sure that we are + not getting blocks mixed up, and panics the kernel if anything out of + the ordinary is seen. + </para> + <para> + To be more realistic, the simulated devices have the transport + attributes of SAS disks. + </para> + <para> + For documentation see + <ulink url='http://www.torque.net/sg/sdebug26.html'>http://www.torque.net/sg/sdebug26.html</ulink> + </para> +<!-- !Edrivers/scsi/scsi_debug.c --> + </sect2> + <sect2 id="todo"> + <title>todo</title> + <para>Parallel (fast/wide/ultra) SCSI, USB, SATA, + SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband, + I20, iSCSI, Parallel ports, netlink... + </para> + </sect2> + </sect1> + </chapter> +</book> diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl new file mode 100644 index 0000000..0c3dc4c --- /dev/null +++ b/Documentation/DocBook/sh.tmpl @@ -0,0 +1,105 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="sh-drivers"> + <bookinfo> + <title>SuperH Interfaces Guide</title> + + <authorgroup> + <author> + <firstname>Paul</firstname> + <surname>Mundt</surname> + <affiliation> + <address> + <email>lethal@linux-sh.org</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2008</year> + <holder>Paul Mundt</holder> + </copyright> + <copyright> + <year>2008</year> + <holder>Renesas Technology Corp.</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="mm"> + <title>Memory Management</title> + <sect1 id="sh4"> + <title>SH-4</title> + <sect2 id="sq"> + <title>Store Queue API</title> +!Earch/sh/kernel/cpu/sh4/sq.c + </sect2> + </sect1> + <sect1 id="sh5"> + <title>SH-5</title> + <sect2 id="tlb"> + <title>TLB Interfaces</title> +!Iarch/sh/mm/tlb-sh5.c +!Iarch/sh/include/asm/tlb_64.h + </sect2> + </sect1> + </chapter> + <chapter id="clk"> + <title>Clock Framework Extensions</title> +!Iarch/sh/include/asm/clock.h + </chapter> + <chapter id="mach"> + <title>Machine Specific Interfaces</title> + <sect1 id="dreamcast"> + <title>mach-dreamcast</title> +!Iarch/sh/boards/mach-dreamcast/rtc.c + </sect1> + <sect1 id="x3proto"> + <title>mach-x3proto</title> +!Earch/sh/boards/mach-x3proto/ilsel.c + </sect1> + </chapter> + <chapter id="busses"> + <title>Busses</title> + <sect1 id="superhyway"> + <title>SuperHyway</title> +!Edrivers/sh/superhyway/superhyway.c + </sect1> + + <sect1 id="maple"> + <title>Maple</title> +!Edrivers/sh/maple/maple.c + </sect1> + </chapter> +</book> diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl new file mode 100644 index 0000000..974e17c --- /dev/null +++ b/Documentation/DocBook/stylesheet.xsl @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<stylesheet xmlns="http://www.w3.org/1999/XSL/Transform" version="1.0"> +<param name="chunk.quietly">1</param> +<param name="funcsynopsis.style">ansi</param> +<param name="funcsynopsis.tabular.threshold">80</param> +<!-- <param name="paper.type">A4</param> --> +<param name="generate.section.toc.level">2</param> +</stylesheet> diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl new file mode 100644 index 0000000..df87d1b --- /dev/null +++ b/Documentation/DocBook/uio-howto.tmpl @@ -0,0 +1,620 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" +"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd" []> + +<book id="index"> +<bookinfo> +<title>The Userspace I/O HOWTO</title> + +<author> + <firstname>Hans-Jürgen</firstname> + <surname>Koch</surname> + <authorblurb><para>Linux developer, Linutronix</para></authorblurb> + <affiliation> + <orgname> + <ulink url="http://www.linutronix.de">Linutronix</ulink> + </orgname> + + <address> + <email>hjk@linutronix.de</email> + </address> + </affiliation> +</author> + +<copyright> + <year>2006-2008</year> + <holder>Hans-Jürgen Koch.</holder> +</copyright> + +<legalnotice> +<para> +This documentation is Free Software licensed under the terms of the +GPL version 2. +</para> +</legalnotice> + +<pubdate>2006-12-11</pubdate> + +<abstract> + <para>This HOWTO describes concept and usage of Linux kernel's + Userspace I/O system.</para> +</abstract> + +<revhistory> + <revision> + <revnumber>0.5</revnumber> + <date>2008-05-22</date> + <authorinitials>hjk</authorinitials> + <revremark>Added description of write() function.</revremark> + </revision> + <revision> + <revnumber>0.4</revnumber> + <date>2007-11-26</date> + <authorinitials>hjk</authorinitials> + <revremark>Removed section about uio_dummy.</revremark> + </revision> + <revision> + <revnumber>0.3</revnumber> + <date>2007-04-29</date> + <authorinitials>hjk</authorinitials> + <revremark>Added section about userspace drivers.</revremark> + </revision> + <revision> + <revnumber>0.2</revnumber> + <date>2007-02-13</date> + <authorinitials>hjk</authorinitials> + <revremark>Update after multiple mappings were added.</revremark> + </revision> + <revision> + <revnumber>0.1</revnumber> + <date>2006-12-11</date> + <authorinitials>hjk</authorinitials> + <revremark>First draft.</revremark> + </revision> +</revhistory> +</bookinfo> + +<chapter id="aboutthisdoc"> +<?dbhtml filename="aboutthis.html"?> +<title>About this document</title> + +<sect1 id="translations"> +<?dbhtml filename="translations.html"?> +<title>Translations</title> + +<para>If you know of any translations for this document, or you are +interested in translating it, please email me +<email>hjk@linutronix.de</email>. +</para> +</sect1> + +<sect1 id="preface"> +<title>Preface</title> + <para> + For many types of devices, creating a Linux kernel driver is + overkill. All that is really needed is some way to handle an + interrupt and provide access to the memory space of the + device. The logic of controlling the device does not + necessarily have to be within the kernel, as the device does + not need to take advantage of any of other resources that the + kernel provides. One such common class of devices that are + like this are for industrial I/O cards. + </para> + <para> + To address this situation, the userspace I/O system (UIO) was + designed. For typical industrial I/O cards, only a very small + kernel module is needed. The main part of the driver will run in + user space. This simplifies development and reduces the risk of + serious bugs within a kernel module. + </para> + <para> + Please note that UIO is not an universal driver interface. Devices + that are already handled well by other kernel subsystems (like + networking or serial or USB) are no candidates for an UIO driver. + Hardware that is ideally suited for an UIO driver fulfills all of + the following: + </para> +<itemizedlist> +<listitem> + <para>The device has memory that can be mapped. The device can be + controlled completely by writing to this memory.</para> +</listitem> +<listitem> + <para>The device usually generates interrupts.</para> +</listitem> +<listitem> + <para>The device does not fit into one of the standard kernel + subsystems.</para> +</listitem> +</itemizedlist> +</sect1> + +<sect1 id="thanks"> +<title>Acknowledgments</title> + <para>I'd like to thank Thomas Gleixner and Benedikt Spranger of + Linutronix, who have not only written most of the UIO code, but also + helped greatly writing this HOWTO by giving me all kinds of background + information.</para> +</sect1> + +<sect1 id="feedback"> +<title>Feedback</title> + <para>Find something wrong with this document? (Or perhaps something + right?) I would love to hear from you. Please email me at + <email>hjk@linutronix.de</email>.</para> +</sect1> +</chapter> + +<chapter id="about"> +<?dbhtml filename="about.html"?> +<title>About UIO</title> + +<para>If you use UIO for your card's driver, here's what you get:</para> + +<itemizedlist> +<listitem> + <para>only one small kernel module to write and maintain.</para> +</listitem> +<listitem> + <para>develop the main part of your driver in user space, + with all the tools and libraries you're used to.</para> +</listitem> +<listitem> + <para>bugs in your driver won't crash the kernel.</para> +</listitem> +<listitem> + <para>updates of your driver can take place without recompiling + the kernel.</para> +</listitem> +</itemizedlist> + +<sect1 id="how_uio_works"> +<title>How UIO works</title> + <para> + Each UIO device is accessed through a device file and several + sysfs attribute files. The device file will be called + <filename>/dev/uio0</filename> for the first device, and + <filename>/dev/uio1</filename>, <filename>/dev/uio2</filename> + and so on for subsequent devices. + </para> + + <para><filename>/dev/uioX</filename> is used to access the + address space of the card. Just use + <function>mmap()</function> to access registers or RAM + locations of your card. + </para> + + <para> + Interrupts are handled by reading from + <filename>/dev/uioX</filename>. A blocking + <function>read()</function> from + <filename>/dev/uioX</filename> will return as soon as an + interrupt occurs. You can also use + <function>select()</function> on + <filename>/dev/uioX</filename> to wait for an interrupt. The + integer value read from <filename>/dev/uioX</filename> + represents the total interrupt count. You can use this number + to figure out if you missed some interrupts. + </para> + <para> + For some hardware that has more than one interrupt source internally, + but not separate IRQ mask and status registers, there might be + situations where userspace cannot determine what the interrupt source + was if the kernel handler disables them by writing to the chip's IRQ + register. In such a case, the kernel has to disable the IRQ completely + to leave the chip's register untouched. Now the userspace part can + determine the cause of the interrupt, but it cannot re-enable + interrupts. Another cornercase is chips where re-enabling interrupts + is a read-modify-write operation to a combined IRQ status/acknowledge + register. This would be racy if a new interrupt occurred + simultaneously. + </para> + <para> + To address these problems, UIO also implements a write() function. It + is normally not used and can be ignored for hardware that has only a + single interrupt source or has separate IRQ mask and status registers. + If you need it, however, a write to <filename>/dev/uioX</filename> + will call the <function>irqcontrol()</function> function implemented + by the driver. You have to write a 32-bit value that is usually either + 0 or 1 to disable or enable interrupts. If a driver does not implement + <function>irqcontrol()</function>, <function>write()</function> will + return with <varname>-ENOSYS</varname>. + </para> + + <para> + To handle interrupts properly, your custom kernel module can + provide its own interrupt handler. It will automatically be + called by the built-in handler. + </para> + + <para> + For cards that don't generate interrupts but need to be + polled, there is the possibility to set up a timer that + triggers the interrupt handler at configurable time intervals. + This interrupt simulation is done by calling + <function>uio_event_notify()</function> + from the timer's event handler. + </para> + + <para> + Each driver provides attributes that are used to read or write + variables. These attributes are accessible through sysfs + files. A custom kernel driver module can add its own + attributes to the device owned by the uio driver, but not added + to the UIO device itself at this time. This might change in the + future if it would be found to be useful. + </para> + + <para> + The following standard attributes are provided by the UIO + framework: + </para> +<itemizedlist> +<listitem> + <para> + <filename>name</filename>: The name of your device. It is + recommended to use the name of your kernel module for this. + </para> +</listitem> +<listitem> + <para> + <filename>version</filename>: A version string defined by your + driver. This allows the user space part of your driver to deal + with different versions of the kernel module. + </para> +</listitem> +<listitem> + <para> + <filename>event</filename>: The total number of interrupts + handled by the driver since the last time the device node was + read. + </para> +</listitem> +</itemizedlist> +<para> + These attributes appear under the + <filename>/sys/class/uio/uioX</filename> directory. Please + note that this directory might be a symlink, and not a real + directory. Any userspace code that accesses it must be able + to handle this. +</para> +<para> + Each UIO device can make one or more memory regions available for + memory mapping. This is necessary because some industrial I/O cards + require access to more than one PCI memory region in a driver. +</para> +<para> + Each mapping has its own directory in sysfs, the first mapping + appears as <filename>/sys/class/uio/uioX/maps/map0/</filename>. + Subsequent mappings create directories <filename>map1/</filename>, + <filename>map2/</filename>, and so on. These directories will only + appear if the size of the mapping is not 0. +</para> +<para> + Each <filename>mapX/</filename> directory contains two read-only files + that show start address and size of the memory: +</para> +<itemizedlist> +<listitem> + <para> + <filename>addr</filename>: The address of memory that can be mapped. + </para> +</listitem> +<listitem> + <para> + <filename>size</filename>: The size, in bytes, of the memory + pointed to by addr. + </para> +</listitem> +</itemizedlist> + +<para> + From userspace, the different mappings are distinguished by adjusting + the <varname>offset</varname> parameter of the + <function>mmap()</function> call. To map the memory of mapping N, you + have to use N times the page size as your offset: +</para> +<programlisting format="linespecific"> +offset = N * getpagesize(); +</programlisting> + +</sect1> +</chapter> + +<chapter id="custom_kernel_module" xreflabel="Writing your own kernel module"> +<?dbhtml filename="custom_kernel_module.html"?> +<title>Writing your own kernel module</title> + <para> + Please have a look at <filename>uio_cif.c</filename> as an + example. The following paragraphs explain the different + sections of this file. + </para> + +<sect1 id="uio_info"> +<title>struct uio_info</title> + <para> + This structure tells the framework the details of your driver, + Some of the members are required, others are optional. + </para> + +<itemizedlist> +<listitem><para> +<varname>char *name</varname>: Required. The name of your driver as +it will appear in sysfs. I recommend using the name of your module for this. +</para></listitem> + +<listitem><para> +<varname>char *version</varname>: Required. This string appears in +<filename>/sys/class/uio/uioX/version</filename>. +</para></listitem> + +<listitem><para> +<varname>struct uio_mem mem[ MAX_UIO_MAPS ]</varname>: Required if you +have memory that can be mapped with <function>mmap()</function>. For each +mapping you need to fill one of the <varname>uio_mem</varname> structures. +See the description below for details. +</para></listitem> + +<listitem><para> +<varname>long irq</varname>: Required. If your hardware generates an +interrupt, it's your modules task to determine the irq number during +initialization. If you don't have a hardware generated interrupt but +want to trigger the interrupt handler in some other way, set +<varname>irq</varname> to <varname>UIO_IRQ_CUSTOM</varname>. +If you had no interrupt at all, you could set +<varname>irq</varname> to <varname>UIO_IRQ_NONE</varname>, though this +rarely makes sense. +</para></listitem> + +<listitem><para> +<varname>unsigned long irq_flags</varname>: Required if you've set +<varname>irq</varname> to a hardware interrupt number. The flags given +here will be used in the call to <function>request_irq()</function>. +</para></listitem> + +<listitem><para> +<varname>int (*mmap)(struct uio_info *info, struct vm_area_struct +*vma)</varname>: Optional. If you need a special +<function>mmap()</function> function, you can set it here. If this +pointer is not NULL, your <function>mmap()</function> will be called +instead of the built-in one. +</para></listitem> + +<listitem><para> +<varname>int (*open)(struct uio_info *info, struct inode *inode) +</varname>: Optional. You might want to have your own +<function>open()</function>, e.g. to enable interrupts only when your +device is actually used. +</para></listitem> + +<listitem><para> +<varname>int (*release)(struct uio_info *info, struct inode *inode) +</varname>: Optional. If you define your own +<function>open()</function>, you will probably also want a custom +<function>release()</function> function. +</para></listitem> + +<listitem><para> +<varname>int (*irqcontrol)(struct uio_info *info, s32 irq_on) +</varname>: Optional. If you need to be able to enable or disable +interrupts from userspace by writing to <filename>/dev/uioX</filename>, +you can implement this function. The parameter <varname>irq_on</varname> +will be 0 to disable interrupts and 1 to enable them. +</para></listitem> +</itemizedlist> + +<para> +Usually, your device will have one or more memory regions that can be mapped +to user space. For each region, you have to set up a +<varname>struct uio_mem</varname> in the <varname>mem[]</varname> array. +Here's a description of the fields of <varname>struct uio_mem</varname>: +</para> + +<itemizedlist> +<listitem><para> +<varname>int memtype</varname>: Required if the mapping is used. Set this to +<varname>UIO_MEM_PHYS</varname> if you you have physical memory on your +card to be mapped. Use <varname>UIO_MEM_LOGICAL</varname> for logical +memory (e.g. allocated with <function>kmalloc()</function>). There's also +<varname>UIO_MEM_VIRTUAL</varname> for virtual memory. +</para></listitem> + +<listitem><para> +<varname>unsigned long addr</varname>: Required if the mapping is used. +Fill in the address of your memory block. This address is the one that +appears in sysfs. +</para></listitem> + +<listitem><para> +<varname>unsigned long size</varname>: Fill in the size of the +memory block that <varname>addr</varname> points to. If <varname>size</varname> +is zero, the mapping is considered unused. Note that you +<emphasis>must</emphasis> initialize <varname>size</varname> with zero for +all unused mappings. +</para></listitem> + +<listitem><para> +<varname>void *internal_addr</varname>: If you have to access this memory +region from within your kernel module, you will want to map it internally by +using something like <function>ioremap()</function>. Addresses +returned by this function cannot be mapped to user space, so you must not +store it in <varname>addr</varname>. Use <varname>internal_addr</varname> +instead to remember such an address. +</para></listitem> +</itemizedlist> + +<para> +Please do not touch the <varname>kobj</varname> element of +<varname>struct uio_mem</varname>! It is used by the UIO framework +to set up sysfs files for this mapping. Simply leave it alone. +</para> +</sect1> + +<sect1 id="adding_irq_handler"> +<title>Adding an interrupt handler</title> + <para> + What you need to do in your interrupt handler depends on your + hardware and on how you want to handle it. You should try to + keep the amount of code in your kernel interrupt handler low. + If your hardware requires no action that you + <emphasis>have</emphasis> to perform after each interrupt, + then your handler can be empty.</para> <para>If, on the other + hand, your hardware <emphasis>needs</emphasis> some action to + be performed after each interrupt, then you + <emphasis>must</emphasis> do it in your kernel module. Note + that you cannot rely on the userspace part of your driver. Your + userspace program can terminate at any time, possibly leaving + your hardware in a state where proper interrupt handling is + still required. + </para> + + <para> + There might also be applications where you want to read data + from your hardware at each interrupt and buffer it in a piece + of kernel memory you've allocated for that purpose. With this + technique you could avoid loss of data if your userspace + program misses an interrupt. + </para> + + <para> + A note on shared interrupts: Your driver should support + interrupt sharing whenever this is possible. It is possible if + and only if your driver can detect whether your hardware has + triggered the interrupt or not. This is usually done by looking + at an interrupt status register. If your driver sees that the + IRQ bit is actually set, it will perform its actions, and the + handler returns IRQ_HANDLED. If the driver detects that it was + not your hardware that caused the interrupt, it will do nothing + and return IRQ_NONE, allowing the kernel to call the next + possible interrupt handler. + </para> + + <para> + If you decide not to support shared interrupts, your card + won't work in computers with no free interrupts. As this + frequently happens on the PC platform, you can save yourself a + lot of trouble by supporting interrupt sharing. + </para> +</sect1> + +</chapter> + +<chapter id="userspace_driver" xreflabel="Writing a driver in user space"> +<?dbhtml filename="userspace_driver.html"?> +<title>Writing a driver in userspace</title> + <para> + Once you have a working kernel module for your hardware, you can + write the userspace part of your driver. You don't need any special + libraries, your driver can be written in any reasonable language, + you can use floating point numbers and so on. In short, you can + use all the tools and libraries you'd normally use for writing a + userspace application. + </para> + +<sect1 id="getting_uio_information"> +<title>Getting information about your UIO device</title> + <para> + Information about all UIO devices is available in sysfs. The + first thing you should do in your driver is check + <varname>name</varname> and <varname>version</varname> to + make sure your talking to the right device and that its kernel + driver has the version you expect. + </para> + <para> + You should also make sure that the memory mapping you need + exists and has the size you expect. + </para> + <para> + There is a tool called <varname>lsuio</varname> that lists + UIO devices and their attributes. It is available here: + </para> + <para> + <ulink url="http://www.osadl.org/projects/downloads/UIO/user/"> + http://www.osadl.org/projects/downloads/UIO/user/</ulink> + </para> + <para> + With <varname>lsuio</varname> you can quickly check if your + kernel module is loaded and which attributes it exports. + Have a look at the manpage for details. + </para> + <para> + The source code of <varname>lsuio</varname> can serve as an + example for getting information about an UIO device. + The file <filename>uio_helper.c</filename> contains a lot of + functions you could use in your userspace driver code. + </para> +</sect1> + +<sect1 id="mmap_device_memory"> +<title>mmap() device memory</title> + <para> + After you made sure you've got the right device with the + memory mappings you need, all you have to do is to call + <function>mmap()</function> to map the device's memory + to userspace. + </para> + <para> + The parameter <varname>offset</varname> of the + <function>mmap()</function> call has a special meaning + for UIO devices: It is used to select which mapping of + your device you want to map. To map the memory of + mapping N, you have to use N times the page size as + your offset: + </para> +<programlisting format="linespecific"> + offset = N * getpagesize(); +</programlisting> + <para> + N starts from zero, so if you've got only one memory + range to map, set <varname>offset = 0</varname>. + A drawback of this technique is that memory is always + mapped beginning with its start address. + </para> +</sect1> + +<sect1 id="wait_for_interrupts"> +<title>Waiting for interrupts</title> + <para> + After you successfully mapped your devices memory, you + can access it like an ordinary array. Usually, you will + perform some initialization. After that, your hardware + starts working and will generate an interrupt as soon + as it's finished, has some data available, or needs your + attention because an error occured. + </para> + <para> + <filename>/dev/uioX</filename> is a read-only file. A + <function>read()</function> will always block until an + interrupt occurs. There is only one legal value for the + <varname>count</varname> parameter of + <function>read()</function>, and that is the size of a + signed 32 bit integer (4). Any other value for + <varname>count</varname> causes <function>read()</function> + to fail. The signed 32 bit integer read is the interrupt + count of your device. If the value is one more than the value + you read the last time, everything is OK. If the difference + is greater than one, you missed interrupts. + </para> + <para> + You can also use <function>select()</function> on + <filename>/dev/uioX</filename>. + </para> +</sect1> + +</chapter> + +<appendix id="app1"> +<title>Further information</title> +<itemizedlist> + <listitem><para> + <ulink url="http://www.osadl.org"> + OSADL homepage.</ulink> + </para></listitem> + <listitem><para> + <ulink url="http://www.linutronix.de"> + Linutronix homepage.</ulink> + </para></listitem> +</itemizedlist> +</appendix> + +</book> diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl new file mode 100644 index 0000000..af29360 --- /dev/null +++ b/Documentation/DocBook/usb.tmpl @@ -0,0 +1,980 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="Linux-USB-API"> + <bookinfo> + <title>The Linux-USB Host Side API</title> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + +<chapter id="intro"> + <title>Introduction to USB on Linux</title> + + <para>A Universal Serial Bus (USB) is used to connect a host, + such as a PC or workstation, to a number of peripheral + devices. USB uses a tree structure, with the host as the + root (the system's master), hubs as interior nodes, and + peripherals as leaves (and slaves). + Modern PCs support several such trees of USB devices, usually + one USB 2.0 tree (480 Mbit/sec each) with + a few USB 1.1 trees (12 Mbit/sec each) that are used when you + connect a USB 1.1 device directly to the machine's "root hub". + </para> + + <para>That master/slave asymmetry was designed-in for a number of + reasons, one being ease of use. It is not physically possible to + assemble (legal) USB cables incorrectly: all upstream "to the host" + connectors are the rectangular type (matching the sockets on + root hubs), and all downstream connectors are the squarish type + (or they are built into the peripheral). + Also, the host software doesn't need to deal with distributed + auto-configuration since the pre-designated master node manages all that. + And finally, at the electrical level, bus protocol overhead is reduced by + eliminating arbitration and moving scheduling into the host software. + </para> + + <para>USB 1.0 was announced in January 1996 and was revised + as USB 1.1 (with improvements in hub specification and + support for interrupt-out transfers) in September 1998. + USB 2.0 was released in April 2000, adding high-speed + transfers and transaction-translating hubs (used for USB 1.1 + and 1.0 backward compatibility). + </para> + + <para>Kernel developers added USB support to Linux early in the 2.2 kernel + series, shortly before 2.3 development forked. Updates from 2.3 were + regularly folded back into 2.2 releases, which improved reliability and + brought <filename>/sbin/hotplug</filename> support as well more drivers. + Such improvements were continued in the 2.5 kernel series, where they added + USB 2.0 support, improved performance, and made the host controller drivers + (HCDs) more consistent. They also simplified the API (to make bugs less + likely) and added internal "kerneldoc" documentation. + </para> + + <para>Linux can run inside USB devices as well as on + the hosts that control the devices. + But USB device drivers running inside those peripherals + don't do the same things as the ones running inside hosts, + so they've been given a different name: + <emphasis>gadget drivers</emphasis>. + This document does not cover gadget drivers. + </para> + + </chapter> + +<chapter id="host"> + <title>USB Host-Side API Model</title> + + <para>Host-side drivers for USB devices talk to the "usbcore" APIs. + There are two. One is intended for + <emphasis>general-purpose</emphasis> drivers (exposed through + driver frameworks), and the other is for drivers that are + <emphasis>part of the core</emphasis>. + Such core drivers include the <emphasis>hub</emphasis> driver + (which manages trees of USB devices) and several different kinds + of <emphasis>host controller drivers</emphasis>, + which control individual busses. + </para> + + <para>The device model seen by USB drivers is relatively complex. + </para> + + <itemizedlist> + + <listitem><para>USB supports four kinds of data transfers + (control, bulk, interrupt, and isochronous). Two of them (control + and bulk) use bandwidth as it's available, + while the other two (interrupt and isochronous) + are scheduled to provide guaranteed bandwidth. + </para></listitem> + + <listitem><para>The device description model includes one or more + "configurations" per device, only one of which is active at a time. + Devices that are capable of high-speed operation must also support + full-speed configurations, along with a way to ask about the + "other speed" configurations which might be used. + </para></listitem> + + <listitem><para>Configurations have one or more "interfaces", each + of which may have "alternate settings". Interfaces may be + standardized by USB "Class" specifications, or may be specific to + a vendor or device.</para> + + <para>USB device drivers actually bind to interfaces, not devices. + Think of them as "interface drivers", though you + may not see many devices where the distinction is important. + <emphasis>Most USB devices are simple, with only one configuration, + one interface, and one alternate setting.</emphasis> + </para></listitem> + + <listitem><para>Interfaces have one or more "endpoints", each of + which supports one type and direction of data transfer such as + "bulk out" or "interrupt in". The entire configuration may have + up to sixteen endpoints in each direction, allocated as needed + among all the interfaces. + </para></listitem> + + <listitem><para>Data transfer on USB is packetized; each endpoint + has a maximum packet size. + Drivers must often be aware of conventions such as flagging the end + of bulk transfers using "short" (including zero length) packets. + </para></listitem> + + <listitem><para>The Linux USB API supports synchronous calls for + control and bulk messages. + It also supports asynchnous calls for all kinds of data transfer, + using request structures called "URBs" (USB Request Blocks). + </para></listitem> + + </itemizedlist> + + <para>Accordingly, the USB Core API exposed to device drivers + covers quite a lot of territory. You'll probably need to consult + the USB 2.0 specification, available online from www.usb.org at + no cost, as well as class or device specifications. + </para> + + <para>The only host-side drivers that actually touch hardware + (reading/writing registers, handling IRQs, and so on) are the HCDs. + In theory, all HCDs provide the same functionality through the same + API. In practice, that's becoming more true on the 2.5 kernels, + but there are still differences that crop up especially with + fault handling. Different controllers don't necessarily report + the same aspects of failures, and recovery from faults (including + software-induced ones like unlinking an URB) isn't yet fully + consistent. + Device driver authors should make a point of doing disconnect + testing (while the device is active) with each different host + controller driver, to make sure drivers don't have bugs of + their own as well as to make sure they aren't relying on some + HCD-specific behavior. + (You will need external USB 1.1 and/or + USB 2.0 hubs to perform all those tests.) + </para> + + </chapter> + +<chapter id="types"><title>USB-Standard Types</title> + + <para>In <filename><linux/usb/ch9.h></filename> you will find + the USB data types defined in chapter 9 of the USB specification. + These data types are used throughout USB, and in APIs including + this host side API, gadget APIs, and usbfs. + </para> + +!Iinclude/linux/usb/ch9.h + + </chapter> + +<chapter id="hostside"><title>Host-Side Data Types and Macros</title> + + <para>The host side API exposes several layers to drivers, some of + which are more necessary than others. + These support lifecycle models for host side drivers + and devices, and support passing buffers through usbcore to + some HCD that performs the I/O for the device driver. + </para> + + +!Iinclude/linux/usb.h + + </chapter> + + <chapter id="usbcore"><title>USB Core APIs</title> + + <para>There are two basic I/O models in the USB API. + The most elemental one is asynchronous: drivers submit requests + in the form of an URB, and the URB's completion callback + handle the next step. + All USB transfer types support that model, although there + are special cases for control URBs (which always have setup + and status stages, but may not have a data stage) and + isochronous URBs (which allow large packets and include + per-packet fault reports). + Built on top of that is synchronous API support, where a + driver calls a routine that allocates one or more URBs, + submits them, and waits until they complete. + There are synchronous wrappers for single-buffer control + and bulk transfers (which are awkward to use in some + driver disconnect scenarios), and for scatterlist based + streaming i/o (bulk or interrupt). + </para> + + <para>USB drivers need to provide buffers that can be + used for DMA, although they don't necessarily need to + provide the DMA mapping themselves. + There are APIs to use used when allocating DMA buffers, + which can prevent use of bounce buffers on some systems. + In some cases, drivers may be able to rely on 64bit DMA + to eliminate another kind of bounce buffer. + </para> + +!Edrivers/usb/core/urb.c +!Edrivers/usb/core/message.c +!Edrivers/usb/core/file.c +!Edrivers/usb/core/driver.c +!Edrivers/usb/core/usb.c +!Edrivers/usb/core/hub.c + </chapter> + + <chapter id="hcd"><title>Host Controller APIs</title> + + <para>These APIs are only for use by host controller drivers, + most of which implement standard register interfaces such as + EHCI, OHCI, or UHCI. + UHCI was one of the first interfaces, designed by Intel and + also used by VIA; it doesn't do much in hardware. + OHCI was designed later, to have the hardware do more work + (bigger transfers, tracking protocol state, and so on). + EHCI was designed with USB 2.0; its design has features that + resemble OHCI (hardware does much more work) as well as + UHCI (some parts of ISO support, TD list processing). + </para> + + <para>There are host controllers other than the "big three", + although most PCI based controllers (and a few non-PCI based + ones) use one of those interfaces. + Not all host controllers use DMA; some use PIO, and there + is also a simulator. + </para> + + <para>The same basic APIs are available to drivers for all + those controllers. + For historical reasons they are in two layers: + <structname>struct usb_bus</structname> is a rather thin + layer that became available in the 2.2 kernels, while + <structname>struct usb_hcd</structname> is a more featureful + layer (available in later 2.4 kernels and in 2.5) that + lets HCDs share common code, to shrink driver size + and significantly reduce hcd-specific behaviors. + </para> + +!Edrivers/usb/core/hcd.c +!Edrivers/usb/core/hcd-pci.c +!Idrivers/usb/core/buffer.c + </chapter> + + <chapter id="usbfs"> + <title>The USB Filesystem (usbfs)</title> + + <para>This chapter presents the Linux <emphasis>usbfs</emphasis>. + You may prefer to avoid writing new kernel code for your + USB driver; that's the problem that usbfs set out to solve. + User mode device drivers are usually packaged as applications + or libraries, and may use usbfs through some programming library + that wraps it. Such libraries include + <ulink url="http://libusb.sourceforge.net">libusb</ulink> + for C/C++, and + <ulink url="http://jUSB.sourceforge.net">jUSB</ulink> for Java. + </para> + + <note><title>Unfinished</title> + <para>This particular documentation is incomplete, + especially with respect to the asynchronous mode. + As of kernel 2.5.66 the code and this (new) documentation + need to be cross-reviewed. + </para> + </note> + + <para>Configure usbfs into Linux kernels by enabling the + <emphasis>USB filesystem</emphasis> option (CONFIG_USB_DEVICEFS), + and you get basic support for user mode USB device drivers. + Until relatively recently it was often (confusingly) called + <emphasis>usbdevfs</emphasis> although it wasn't solving what + <emphasis>devfs</emphasis> was. + Every USB device will appear in usbfs, regardless of whether or + not it has a kernel driver. + </para> + + <sect1 id="usbfs-files"> + <title>What files are in "usbfs"?</title> + + <para>Conventionally mounted at + <filename>/proc/bus/usb</filename>, usbfs + features include: + <itemizedlist> + <listitem><para><filename>/proc/bus/usb/devices</filename> + ... a text file + showing each of the USB devices on known to the kernel, + and their configuration descriptors. + You can also poll() this to learn about new devices. + </para></listitem> + <listitem><para><filename>/proc/bus/usb/BBB/DDD</filename> + ... magic files + exposing the each device's configuration descriptors, and + supporting a series of ioctls for making device requests, + including I/O to devices. (Purely for access by programs.) + </para></listitem> + </itemizedlist> + </para> + + <para> Each bus is given a number (BBB) based on when it was + enumerated; within each bus, each device is given a similar + number (DDD). + Those BBB/DDD paths are not "stable" identifiers; + expect them to change even if you always leave the devices + plugged in to the same hub port. + <emphasis>Don't even think of saving these in application + configuration files.</emphasis> + Stable identifiers are available, for user mode applications + that want to use them. HID and networking devices expose + these stable IDs, so that for example you can be sure that + you told the right UPS to power down its second server. + "usbfs" doesn't (yet) expose those IDs. + </para> + + </sect1> + + <sect1 id="usbfs-fstab"> + <title>Mounting and Access Control</title> + + <para>There are a number of mount options for usbfs, which will + be of most interest to you if you need to override the default + access control policy. + That policy is that only root may read or write device files + (<filename>/proc/bus/BBB/DDD</filename>) although anyone may read + the <filename>devices</filename> + or <filename>drivers</filename> files. + I/O requests to the device also need the CAP_SYS_RAWIO capability, + </para> + + <para>The significance of that is that by default, all user mode + device drivers need super-user privileges. + You can change modes or ownership in a driver setup + when the device hotplugs, or maye just start the + driver right then, as a privileged server (or some activity + within one). + That's the most secure approach for multi-user systems, + but for single user systems ("trusted" by that user) + it's more convenient just to grant everyone all access + (using the <emphasis>devmode=0666</emphasis> option) + so the driver can start whenever it's needed. + </para> + + <para>The mount options for usbfs, usable in /etc/fstab or + in command line invocations of <emphasis>mount</emphasis>, are: + + <variablelist> + <varlistentry> + <term><emphasis>busgid</emphasis>=NNNNN</term> + <listitem><para>Controls the GID used for the + /proc/bus/usb/BBB + directories. (Default: 0)</para></listitem></varlistentry> + <varlistentry><term><emphasis>busmode</emphasis>=MMM</term> + <listitem><para>Controls the file mode used for the + /proc/bus/usb/BBB + directories. (Default: 0555) + </para></listitem></varlistentry> + <varlistentry><term><emphasis>busuid</emphasis>=NNNNN</term> + <listitem><para>Controls the UID used for the + /proc/bus/usb/BBB + directories. (Default: 0)</para></listitem></varlistentry> + + <varlistentry><term><emphasis>devgid</emphasis>=NNNNN</term> + <listitem><para>Controls the GID used for the + /proc/bus/usb/BBB/DDD + files. (Default: 0)</para></listitem></varlistentry> + <varlistentry><term><emphasis>devmode</emphasis>=MMM</term> + <listitem><para>Controls the file mode used for the + /proc/bus/usb/BBB/DDD + files. (Default: 0644)</para></listitem></varlistentry> + <varlistentry><term><emphasis>devuid</emphasis>=NNNNN</term> + <listitem><para>Controls the UID used for the + /proc/bus/usb/BBB/DDD + files. (Default: 0)</para></listitem></varlistentry> + + <varlistentry><term><emphasis>listgid</emphasis>=NNNNN</term> + <listitem><para>Controls the GID used for the + /proc/bus/usb/devices and drivers files. + (Default: 0)</para></listitem></varlistentry> + <varlistentry><term><emphasis>listmode</emphasis>=MMM</term> + <listitem><para>Controls the file mode used for the + /proc/bus/usb/devices and drivers files. + (Default: 0444)</para></listitem></varlistentry> + <varlistentry><term><emphasis>listuid</emphasis>=NNNNN</term> + <listitem><para>Controls the UID used for the + /proc/bus/usb/devices and drivers files. + (Default: 0)</para></listitem></varlistentry> + </variablelist> + + </para> + + <para>Note that many Linux distributions hard-wire the mount options + for usbfs in their init scripts, such as + <filename>/etc/rc.d/rc.sysinit</filename>, + rather than making it easy to set this per-system + policy in <filename>/etc/fstab</filename>. + </para> + + </sect1> + + <sect1 id="usbfs-devices"> + <title>/proc/bus/usb/devices</title> + + <para>This file is handy for status viewing tools in user + mode, which can scan the text format and ignore most of it. + More detailed device status (including class and vendor + status) is available from device-specific files. + For information about the current format of this file, + see the + <filename>Documentation/usb/proc_usb_info.txt</filename> + file in your Linux kernel sources. + </para> + + <para>This file, in combination with the poll() system call, can + also be used to detect when devices are added or removed: +<programlisting>int fd; +struct pollfd pfd; + +fd = open("/proc/bus/usb/devices", O_RDONLY); +pfd = { fd, POLLIN, 0 }; +for (;;) { + /* The first time through, this call will return immediately. */ + poll(&pfd, 1, -1); + + /* To see what's changed, compare the file's previous and current + contents or scan the filesystem. (Scanning is more precise.) */ +}</programlisting> + Note that this behavior is intended to be used for informational + and debug purposes. It would be more appropriate to use programs + such as udev or HAL to initialize a device or start a user-mode + helper program, for instance. + </para> + </sect1> + + <sect1 id="usbfs-bbbddd"> + <title>/proc/bus/usb/BBB/DDD</title> + + <para>Use these files in one of these basic ways: + </para> + + <para><emphasis>They can be read,</emphasis> + producing first the device descriptor + (18 bytes) and then the descriptors for the current configuration. + See the USB 2.0 spec for details about those binary data formats. + You'll need to convert most multibyte values from little endian + format to your native host byte order, although a few of the + fields in the device descriptor (both of the BCD-encoded fields, + and the vendor and product IDs) will be byteswapped for you. + Note that configuration descriptors include descriptors for + interfaces, altsettings, endpoints, and maybe additional + class descriptors. + </para> + + <para><emphasis>Perform USB operations</emphasis> using + <emphasis>ioctl()</emphasis> requests to make endpoint I/O + requests (synchronously or asynchronously) or manage + the device. + These requests need the CAP_SYS_RAWIO capability, + as well as filesystem access permissions. + Only one ioctl request can be made on one of these + device files at a time. + This means that if you are synchronously reading an endpoint + from one thread, you won't be able to write to a different + endpoint from another thread until the read completes. + This works for <emphasis>half duplex</emphasis> protocols, + but otherwise you'd use asynchronous i/o requests. + </para> + + </sect1> + + + <sect1 id="usbfs-lifecycle"> + <title>Life Cycle of User Mode Drivers</title> + + <para>Such a driver first needs to find a device file + for a device it knows how to handle. + Maybe it was told about it because a + <filename>/sbin/hotplug</filename> event handling agent + chose that driver to handle the new device. + Or maybe it's an application that scans all the + /proc/bus/usb device files, and ignores most devices. + In either case, it should <function>read()</function> all + the descriptors from the device file, + and check them against what it knows how to handle. + It might just reject everything except a particular + vendor and product ID, or need a more complex policy. + </para> + + <para>Never assume there will only be one such device + on the system at a time! + If your code can't handle more than one device at + a time, at least detect when there's more than one, and + have your users choose which device to use. + </para> + + <para>Once your user mode driver knows what device to use, + it interacts with it in either of two styles. + The simple style is to make only control requests; some + devices don't need more complex interactions than those. + (An example might be software using vendor-specific control + requests for some initialization or configuration tasks, + with a kernel driver for the rest.) + </para> + + <para>More likely, you need a more complex style driver: + one using non-control endpoints, reading or writing data + and claiming exclusive use of an interface. + <emphasis>Bulk</emphasis> transfers are easiest to use, + but only their sibling <emphasis>interrupt</emphasis> transfers + work with low speed devices. + Both interrupt and <emphasis>isochronous</emphasis> transfers + offer service guarantees because their bandwidth is reserved. + Such "periodic" transfers are awkward to use through usbfs, + unless you're using the asynchronous calls. However, interrupt + transfers can also be used in a synchronous "one shot" style. + </para> + + <para>Your user-mode driver should never need to worry + about cleaning up request state when the device is + disconnected, although it should close its open file + descriptors as soon as it starts seeing the ENODEV + errors. + </para> + + </sect1> + + <sect1 id="usbfs-ioctl"><title>The ioctl() Requests</title> + + <para>To use these ioctls, you need to include the following + headers in your userspace program: +<programlisting>#include <linux/usb.h> +#include <linux/usbdevice_fs.h> +#include <asm/byteorder.h></programlisting> + The standard USB device model requests, from "Chapter 9" of + the USB 2.0 specification, are automatically included from + the <filename><linux/usb/ch9.h></filename> header. + </para> + + <para>Unless noted otherwise, the ioctl requests + described here will + update the modification time on the usbfs file to which + they are applied (unless they fail). + A return of zero indicates success; otherwise, a + standard USB error code is returned. (These are + documented in + <filename>Documentation/usb/error-codes.txt</filename> + in your kernel sources.) + </para> + + <para>Each of these files multiplexes access to several + I/O streams, one per endpoint. + Each device has one control endpoint (endpoint zero) + which supports a limited RPC style RPC access. + Devices are configured + by khubd (in the kernel) setting a device-wide + <emphasis>configuration</emphasis> that affects things + like power consumption and basic functionality. + The endpoints are part of USB <emphasis>interfaces</emphasis>, + which may have <emphasis>altsettings</emphasis> + affecting things like which endpoints are available. + Many devices only have a single configuration and interface, + so drivers for them will ignore configurations and altsettings. + </para> + + + <sect2 id="usbfs-mgmt"> + <title>Management/Status Requests</title> + + <para>A number of usbfs requests don't deal very directly + with device I/O. + They mostly relate to device management and status. + These are all synchronous requests. + </para> + + <variablelist> + + <varlistentry><term>USBDEVFS_CLAIMINTERFACE</term> + <listitem><para>This is used to force usbfs to + claim a specific interface, + which has not previously been claimed by usbfs or any other + kernel driver. + The ioctl parameter is an integer holding the number of + the interface (bInterfaceNumber from descriptor). + </para><para> + Note that if your driver doesn't claim an interface + before trying to use one of its endpoints, and no + other driver has bound to it, then the interface is + automatically claimed by usbfs. + </para><para> + This claim will be released by a RELEASEINTERFACE ioctl, + or by closing the file descriptor. + File modification time is not updated by this request. + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_CONNECTINFO</term> + <listitem><para>Says whether the device is lowspeed. + The ioctl parameter points to a structure like this: +<programlisting>struct usbdevfs_connectinfo { + unsigned int devnum; + unsigned char slow; +}; </programlisting> + File modification time is not updated by this request. + </para><para> + <emphasis>You can't tell whether a "not slow" + device is connected at high speed (480 MBit/sec) + or just full speed (12 MBit/sec).</emphasis> + You should know the devnum value already, + it's the DDD value of the device file name. + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_GETDRIVER</term> + <listitem><para>Returns the name of the kernel driver + bound to a given interface (a string). Parameter + is a pointer to this structure, which is modified: +<programlisting>struct usbdevfs_getdriver { + unsigned int interface; + char driver[USBDEVFS_MAXDRIVERNAME + 1]; +};</programlisting> + File modification time is not updated by this request. + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_IOCTL</term> + <listitem><para>Passes a request from userspace through + to a kernel driver that has an ioctl entry in the + <emphasis>struct usb_driver</emphasis> it registered. +<programlisting>struct usbdevfs_ioctl { + int ifno; + int ioctl_code; + void *data; +}; + +/* user mode call looks like this. + * 'request' becomes the driver->ioctl() 'code' parameter. + * the size of 'param' is encoded in 'request', and that data + * is copied to or from the driver->ioctl() 'buf' parameter. + */ +static int +usbdev_ioctl (int fd, int ifno, unsigned request, void *param) +{ + struct usbdevfs_ioctl wrapper; + + wrapper.ifno = ifno; + wrapper.ioctl_code = request; + wrapper.data = param; + + return ioctl (fd, USBDEVFS_IOCTL, &wrapper); +} </programlisting> + File modification time is not updated by this request. + </para><para> + This request lets kernel drivers talk to user mode code + through filesystem operations even when they don't create + a charactor or block special device. + It's also been used to do things like ask devices what + device special file should be used. + Two pre-defined ioctls are used + to disconnect and reconnect kernel drivers, so + that user mode code can completely manage binding + and configuration of devices. + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_RELEASEINTERFACE</term> + <listitem><para>This is used to release the claim usbfs + made on interface, either implicitly or because of a + USBDEVFS_CLAIMINTERFACE call, before the file + descriptor is closed. + The ioctl parameter is an integer holding the number of + the interface (bInterfaceNumber from descriptor); + File modification time is not updated by this request. + </para><warning><para> + <emphasis>No security check is made to ensure + that the task which made the claim is the one + which is releasing it. + This means that user mode driver may interfere + other ones. </emphasis> + </para></warning></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_RESETEP</term> + <listitem><para>Resets the data toggle value for an endpoint + (bulk or interrupt) to DATA0. + The ioctl parameter is an integer endpoint number + (1 to 15, as identified in the endpoint descriptor), + with USB_DIR_IN added if the device's endpoint sends + data to the host. + </para><warning><para> + <emphasis>Avoid using this request. + It should probably be removed.</emphasis> + Using it typically means the device and driver will lose + toggle synchronization. If you really lost synchronization, + you likely need to completely handshake with the device, + using a request like CLEAR_HALT + or SET_INTERFACE. + </para></warning></listitem></varlistentry> + + </variablelist> + + </sect2> + + <sect2 id="usbfs-sync"> + <title>Synchronous I/O Support</title> + + <para>Synchronous requests involve the kernel blocking + until the user mode request completes, either by + finishing successfully or by reporting an error. + In most cases this is the simplest way to use usbfs, + although as noted above it does prevent performing I/O + to more than one endpoint at a time. + </para> + + <variablelist> + + <varlistentry><term>USBDEVFS_BULK</term> + <listitem><para>Issues a bulk read or write request to the + device. + The ioctl parameter is a pointer to this structure: +<programlisting>struct usbdevfs_bulktransfer { + unsigned int ep; + unsigned int len; + unsigned int timeout; /* in milliseconds */ + void *data; +};</programlisting> + </para><para>The "ep" value identifies a + bulk endpoint number (1 to 15, as identified in an endpoint + descriptor), + masked with USB_DIR_IN when referring to an endpoint which + sends data to the host from the device. + The length of the data buffer is identified by "len"; + Recent kernels support requests up to about 128KBytes. + <emphasis>FIXME say how read length is returned, + and how short reads are handled.</emphasis>. + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_CLEAR_HALT</term> + <listitem><para>Clears endpoint halt (stall) and + resets the endpoint toggle. This is only + meaningful for bulk or interrupt endpoints. + The ioctl parameter is an integer endpoint number + (1 to 15, as identified in an endpoint descriptor), + masked with USB_DIR_IN when referring to an endpoint which + sends data to the host from the device. + </para><para> + Use this on bulk or interrupt endpoints which have + stalled, returning <emphasis>-EPIPE</emphasis> status + to a data transfer request. + Do not issue the control request directly, since + that could invalidate the host's record of the + data toggle. + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_CONTROL</term> + <listitem><para>Issues a control request to the device. + The ioctl parameter points to a structure like this: +<programlisting>struct usbdevfs_ctrltransfer { + __u8 bRequestType; + __u8 bRequest; + __u16 wValue; + __u16 wIndex; + __u16 wLength; + __u32 timeout; /* in milliseconds */ + void *data; +};</programlisting> + </para><para> + The first eight bytes of this structure are the contents + of the SETUP packet to be sent to the device; see the + USB 2.0 specification for details. + The bRequestType value is composed by combining a + USB_TYPE_* value, a USB_DIR_* value, and a + USB_RECIP_* value (from + <emphasis><linux/usb.h></emphasis>). + If wLength is nonzero, it describes the length of the data + buffer, which is either written to the device + (USB_DIR_OUT) or read from the device (USB_DIR_IN). + </para><para> + At this writing, you can't transfer more than 4 KBytes + of data to or from a device; usbfs has a limit, and + some host controller drivers have a limit. + (That's not usually a problem.) + <emphasis>Also</emphasis> there's no way to say it's + not OK to get a short read back from the device. + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_RESET</term> + <listitem><para>Does a USB level device reset. + The ioctl parameter is ignored. + After the reset, this rebinds all device interfaces. + File modification time is not updated by this request. + </para><warning><para> + <emphasis>Avoid using this call</emphasis> + until some usbcore bugs get fixed, + since it does not fully synchronize device, interface, + and driver (not just usbfs) state. + </para></warning></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_SETINTERFACE</term> + <listitem><para>Sets the alternate setting for an + interface. The ioctl parameter is a pointer to a + structure like this: +<programlisting>struct usbdevfs_setinterface { + unsigned int interface; + unsigned int altsetting; +}; </programlisting> + File modification time is not updated by this request. + </para><para> + Those struct members are from some interface descriptor + applying to the current configuration. + The interface number is the bInterfaceNumber value, and + the altsetting number is the bAlternateSetting value. + (This resets each endpoint in the interface.) + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_SETCONFIGURATION</term> + <listitem><para>Issues the + <function>usb_set_configuration</function> call + for the device. + The parameter is an integer holding the number of + a configuration (bConfigurationValue from descriptor). + File modification time is not updated by this request. + </para><warning><para> + <emphasis>Avoid using this call</emphasis> + until some usbcore bugs get fixed, + since it does not fully synchronize device, interface, + and driver (not just usbfs) state. + </para></warning></listitem></varlistentry> + + </variablelist> + </sect2> + + <sect2 id="usbfs-async"> + <title>Asynchronous I/O Support</title> + + <para>As mentioned above, there are situations where it may be + important to initiate concurrent operations from user mode code. + This is particularly important for periodic transfers + (interrupt and isochronous), but it can be used for other + kinds of USB requests too. + In such cases, the asynchronous requests described here + are essential. Rather than submitting one request and having + the kernel block until it completes, the blocking is separate. + </para> + + <para>These requests are packaged into a structure that + resembles the URB used by kernel device drivers. + (No POSIX Async I/O support here, sorry.) + It identifies the endpoint type (USBDEVFS_URB_TYPE_*), + endpoint (number, masked with USB_DIR_IN as appropriate), + buffer and length, and a user "context" value serving to + uniquely identify each request. + (It's usually a pointer to per-request data.) + Flags can modify requests (not as many as supported for + kernel drivers). + </para> + + <para>Each request can specify a realtime signal number + (between SIGRTMIN and SIGRTMAX, inclusive) to request a + signal be sent when the request completes. + </para> + + <para>When usbfs returns these urbs, the status value + is updated, and the buffer may have been modified. + Except for isochronous transfers, the actual_length is + updated to say how many bytes were transferred; if the + USBDEVFS_URB_DISABLE_SPD flag is set + ("short packets are not OK"), if fewer bytes were read + than were requested then you get an error report. + </para> + +<programlisting>struct usbdevfs_iso_packet_desc { + unsigned int length; + unsigned int actual_length; + unsigned int status; +}; + +struct usbdevfs_urb { + unsigned char type; + unsigned char endpoint; + int status; + unsigned int flags; + void *buffer; + int buffer_length; + int actual_length; + int start_frame; + int number_of_packets; + int error_count; + unsigned int signr; + void *usercontext; + struct usbdevfs_iso_packet_desc iso_frame_desc[]; +};</programlisting> + + <para> For these asynchronous requests, the file modification + time reflects when the request was initiated. + This contrasts with their use with the synchronous requests, + where it reflects when requests complete. + </para> + + <variablelist> + + <varlistentry><term>USBDEVFS_DISCARDURB</term> + <listitem><para> + <emphasis>TBS</emphasis> + File modification time is not updated by this request. + </para><para> + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_DISCSIGNAL</term> + <listitem><para> + <emphasis>TBS</emphasis> + File modification time is not updated by this request. + </para><para> + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_REAPURB</term> + <listitem><para> + <emphasis>TBS</emphasis> + File modification time is not updated by this request. + </para><para> + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_REAPURBNDELAY</term> + <listitem><para> + <emphasis>TBS</emphasis> + File modification time is not updated by this request. + </para><para> + </para></listitem></varlistentry> + + <varlistentry><term>USBDEVFS_SUBMITURB</term> + <listitem><para> + <emphasis>TBS</emphasis> + </para><para> + </para></listitem></varlistentry> + + </variablelist> + </sect2> + + </sect1> + + </chapter> + +</book> +<!-- vim:syntax=sgml:sw=4 +--> diff --git a/Documentation/DocBook/wanbook.tmpl b/Documentation/DocBook/wanbook.tmpl new file mode 100644 index 0000000..8c93db1 --- /dev/null +++ b/Documentation/DocBook/wanbook.tmpl @@ -0,0 +1,99 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="WANGuide"> + <bookinfo> + <title>Synchronous PPP and Cisco HDLC Programming Guide</title> + + <authorgroup> + <author> + <firstname>Alan</firstname> + <surname>Cox</surname> + <affiliation> + <address> + <email>alan@lxorguk.ukuu.org.uk</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2000</year> + <holder>Alan Cox</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + The syncppp drivers in Linux provide a fairly complete + implementation of Cisco HDLC and a minimal implementation of + PPP. The longer term goal is to switch the PPP layer to the + generic PPP interface that is new in Linux 2.3.x. The API should + remain unchanged when this is done, but support will then be + available for IPX, compression and other PPP features + </para> + </chapter> + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + <variablelist> + <varlistentry><term>PPP is minimal</term> + <listitem> + <para> + The current PPP implementation is very basic, although sufficient + for most wan usages. + </para> + </listitem></varlistentry> + + <varlistentry><term>Cisco HDLC Quirks</term> + <listitem> + <para> + Currently we do not end all packets with the correct Cisco multicast + or unicast flags. Nothing appears to mind too much but this should + be corrected. + </para> + </listitem></varlistentry> + </variablelist> + + </para> + </chapter> + + <chapter id="pubfunctions"> + <title>Public Functions Provided</title> +!Edrivers/net/wan/syncppp.c + </chapter> + +</book> diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl new file mode 100644 index 0000000..eeff19c --- /dev/null +++ b/Documentation/DocBook/writing_usb_driver.tmpl @@ -0,0 +1,412 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="USBDeviceDriver"> + <bookinfo> + <title>Writing USB Device Drivers</title> + + <authorgroup> + <author> + <firstname>Greg</firstname> + <surname>Kroah-Hartman</surname> + <affiliation> + <address> + <email>greg@kroah.com</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2001-2002</year> + <holder>Greg Kroah-Hartman</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + + <para> + This documentation is based on an article published in + Linux Journal Magazine, October 2001, Issue 90. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + The Linux USB subsystem has grown from supporting only two different + types of devices in the 2.2.7 kernel (mice and keyboards), to over 20 + different types of devices in the 2.4 kernel. Linux currently supports + almost all USB class devices (standard types of devices like keyboards, + mice, modems, printers and speakers) and an ever-growing number of + vendor-specific devices (such as USB to serial converters, digital + cameras, Ethernet devices and MP3 players). For a full list of the + different USB devices currently supported, see Resources. + </para> + <para> + The remaining kinds of USB devices that do not have support on Linux are + almost all vendor-specific devices. Each vendor decides to implement a + custom protocol to talk to their device, so a custom driver usually needs + to be created. Some vendors are open with their USB protocols and help + with the creation of Linux drivers, while others do not publish them, and + developers are forced to reverse-engineer. See Resources for some links + to handy reverse-engineering tools. + </para> + <para> + Because each different protocol causes a new driver to be created, I have + written a generic USB driver skeleton, modeled after the pci-skeleton.c + file in the kernel source tree upon which many PCI network drivers have + been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c + in the kernel source tree. In this article I will walk through the basics + of the skeleton driver, explaining the different pieces and what needs to + be done to customize it to your specific device. + </para> + </chapter> + + <chapter id="basics"> + <title>Linux USB Basics</title> + <para> + If you are going to write a Linux USB driver, please become familiar with + the USB protocol specification. It can be found, along with many other + useful documents, at the USB home page (see Resources). An excellent + introduction to the Linux USB subsystem can be found at the USB Working + Devices List (see Resources). It explains how the Linux USB subsystem is + structured and introduces the reader to the concept of USB urbs + (USB Request Blocks), which are essential to USB drivers. + </para> + <para> + The first thing a Linux USB driver needs to do is register itself with + the Linux USB subsystem, giving it some information about which devices + the driver supports and which functions to call when a device supported + by the driver is inserted or removed from the system. All of this + information is passed to the USB subsystem in the usb_driver structure. + The skeleton driver declares a usb_driver as: + </para> + <programlisting> +static struct usb_driver skel_driver = { + .name = "skeleton", + .probe = skel_probe, + .disconnect = skel_disconnect, + .fops = &skel_fops, + .minor = USB_SKEL_MINOR_BASE, + .id_table = skel_table, +}; + </programlisting> + <para> + The variable name is a string that describes the driver. It is used in + informational messages printed to the system log. The probe and + disconnect function pointers are called when a device that matches the + information provided in the id_table variable is either seen or removed. + </para> + <para> + The fops and minor variables are optional. Most USB drivers hook into + another kernel subsystem, such as the SCSI, network or TTY subsystem. + These types of drivers register themselves with the other kernel + subsystem, and any user-space interactions are provided through that + interface. But for drivers that do not have a matching kernel subsystem, + such as MP3 players or scanners, a method of interacting with user space + is needed. The USB subsystem provides a way to register a minor device + number and a set of file_operations function pointers that enable this + user-space interaction. The skeleton driver needs this kind of interface, + so it provides a minor starting number and a pointer to its + file_operations functions. + </para> + <para> + The USB driver is then registered with a call to usb_register, usually in + the driver's init function, as shown here: + </para> + <programlisting> +static int __init usb_skel_init(void) +{ + int result; + + /* register this driver with the USB subsystem */ + result = usb_register(&skel_driver); + if (result < 0) { + err("usb_register failed for the "__FILE__ "driver." + "Error number %d", result); + return -1; + } + + return 0; +} +module_init(usb_skel_init); + </programlisting> + <para> + When the driver is unloaded from the system, it needs to deregister + itself with the USB subsystem. This is done with the usb_deregister + function: + </para> + <programlisting> +static void __exit usb_skel_exit(void) +{ + /* deregister this driver with the USB subsystem */ + usb_deregister(&skel_driver); +} +module_exit(usb_skel_exit); + </programlisting> + <para> + To enable the linux-hotplug system to load the driver automatically when + the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The + following code tells the hotplug scripts that this module supports a + single device with a specific vendor and product ID: + </para> + <programlisting> +/* table of devices that work with this driver */ +static struct usb_device_id skel_table [] = { + { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) }, + { } /* Terminating entry */ +}; +MODULE_DEVICE_TABLE (usb, skel_table); + </programlisting> + <para> + There are other macros that can be used in describing a usb_device_id for + drivers that support a whole class of USB drivers. See usb.h for more + information on this. + </para> + </chapter> + + <chapter id="device"> + <title>Device operation</title> + <para> + When a device is plugged into the USB bus that matches the device ID + pattern that your driver registered with the USB core, the probe function + is called. The usb_device structure, interface number and the interface ID + are passed to the function: + </para> + <programlisting> +static int skel_probe(struct usb_interface *interface, + const struct usb_device_id *id) + </programlisting> + <para> + The driver now needs to verify that this device is actually one that it + can accept. If so, it returns 0. + If not, or if any error occurs during initialization, an errorcode + (such as <literal>-ENOMEM</literal> or <literal>-ENODEV</literal>) + is returned from the probe function. + </para> + <para> + In the skeleton driver, we determine what end points are marked as bulk-in + and bulk-out. We create buffers to hold the data that will be sent and + received from the device, and a USB urb to write data to the device is + initialized. + </para> + <para> + Conversely, when the device is removed from the USB bus, the disconnect + function is called with the device pointer. The driver needs to clean any + private data that has been allocated at this time and to shut down any + pending urbs that are in the USB system. + </para> + <para> + Now that the device is plugged into the system and the driver is bound to + the device, any of the functions in the file_operations structure that + were passed to the USB subsystem will be called from a user program trying + to talk to the device. The first function called will be open, as the + program tries to open the device for I/O. We increment our private usage + count and save a pointer to our internal structure in the file + structure. This is done so that future calls to file operations will + enable the driver to determine which device the user is addressing. All + of this is done with the following code: + </para> + <programlisting> +/* increment our usage count for the module */ +++skel->open_count; + +/* save our object in the file's private structure */ +file->private_data = dev; + </programlisting> + <para> + After the open function is called, the read and write functions are called + to receive and send data to the device. In the skel_write function, we + receive a pointer to some data that the user wants to send to the device + and the size of the data. The function determines how much data it can + send to the device based on the size of the write urb it has created (this + size depends on the size of the bulk out end point that the device has). + Then it copies the data from user space to kernel space, points the urb to + the data and submits the urb to the USB subsystem. This can be seen in + the following code: + </para> + <programlisting> +/* we can only write as much as 1 urb will hold */ +bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count; + +/* copy the data from user space into our urb */ +copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written); + +/* set up our urb */ +usb_fill_bulk_urb(skel->write_urb, + skel->dev, + usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr), + skel->write_urb->transfer_buffer, + bytes_written, + skel_write_bulk_callback, + skel); + +/* send the data out the bulk port */ +result = usb_submit_urb(skel->write_urb); +if (result) { + err("Failed submitting write urb, error %d", result); +} + </programlisting> + <para> + When the write urb is filled up with the proper information using the + usb_fill_bulk_urb function, we point the urb's completion callback to call our + own skel_write_bulk_callback function. This function is called when the + urb is finished by the USB subsystem. The callback function is called in + interrupt context, so caution must be taken not to do very much processing + at that time. Our implementation of skel_write_bulk_callback merely + reports if the urb was completed successfully or not and then returns. + </para> + <para> + The read function works a bit differently from the write function in that + we do not use an urb to transfer data from the device to the driver. + Instead we call the usb_bulk_msg function, which can be used to send or + receive data from a device without having to create urbs and handle + urb completion callback functions. We call the usb_bulk_msg function, + giving it a buffer into which to place any data received from the device + and a timeout value. If the timeout period expires without receiving any + data from the device, the function will fail and return an error message. + This can be shown with the following code: + </para> + <programlisting> +/* do an immediate bulk read to get data from the device */ +retval = usb_bulk_msg (skel->dev, + usb_rcvbulkpipe (skel->dev, + skel->bulk_in_endpointAddr), + skel->bulk_in_buffer, + skel->bulk_in_size, + &count, HZ*10); +/* if the read was successful, copy the data to user space */ +if (!retval) { + if (copy_to_user (buffer, skel->bulk_in_buffer, count)) + retval = -EFAULT; + else + retval = count; +} + </programlisting> + <para> + The usb_bulk_msg function can be very useful for doing single reads or + writes to a device; however, if you need to read or write constantly to a + device, it is recommended to set up your own urbs and submit them to the + USB subsystem. + </para> + <para> + When the user program releases the file handle that it has been using to + talk to the device, the release function in the driver is called. In this + function we decrement our private usage count and wait for possible + pending writes: + </para> + <programlisting> +/* decrement our usage count for the device */ +--skel->open_count; + </programlisting> + <para> + One of the more difficult problems that USB drivers must be able to handle + smoothly is the fact that the USB device may be removed from the system at + any point in time, even if a program is currently talking to it. It needs + to be able to shut down any current reads and writes and notify the + user-space programs that the device is no longer there. The following + code (function <function>skel_delete</function>) + is an example of how to do this: </para> + <programlisting> +static inline void skel_delete (struct usb_skel *dev) +{ + kfree (dev->bulk_in_buffer); + if (dev->bulk_out_buffer != NULL) + usb_buffer_free (dev->udev, dev->bulk_out_size, + dev->bulk_out_buffer, + dev->write_urb->transfer_dma); + usb_free_urb (dev->write_urb); + kfree (dev); +} + </programlisting> + <para> + If a program currently has an open handle to the device, we reset the flag + <literal>device_present</literal>. For + every read, write, release and other functions that expect a device to be + present, the driver first checks this flag to see if the device is + still present. If not, it releases that the device has disappeared, and a + -ENODEV error is returned to the user-space program. When the release + function is eventually called, it determines if there is no device + and if not, it does the cleanup that the skel_disconnect + function normally does if there are no open files on the device (see + Listing 5). + </para> + </chapter> + + <chapter id="iso"> + <title>Isochronous Data</title> + <para> + This usb-skeleton driver does not have any examples of interrupt or + isochronous data being sent to or from the device. Interrupt data is sent + almost exactly as bulk data is, with a few minor exceptions. Isochronous + data works differently with continuous streams of data being sent to or + from the device. The audio and video camera drivers are very good examples + of drivers that handle isochronous data and will be useful if you also + need to do this. + </para> + </chapter> + + <chapter id="Conclusion"> + <title>Conclusion</title> + <para> + Writing Linux USB device drivers is not a difficult task as the + usb-skeleton driver shows. This driver, combined with the other current + USB drivers, should provide enough examples to help a beginning author + create a working driver in a minimal amount of time. The linux-usb-devel + mailing list archives also contain a lot of helpful information. + </para> + </chapter> + + <chapter id="resources"> + <title>Resources</title> + <para> + The Linux USB Project: <ulink url="http://www.linux-usb.org">http://www.linux-usb.org/</ulink> + </para> + <para> + Linux Hotplug Project: <ulink url="http://linux-hotplug.sourceforge.net">http://linux-hotplug.sourceforge.net/</ulink> + </para> + <para> + Linux USB Working Devices List: <ulink url="http://www.qbik.ch/usb/devices">http://www.qbik.ch/usb/devices/</ulink> + </para> + <para> + linux-usb-devel Mailing List Archives: <ulink url="http://marc.theaimsgroup.com/?l=linux-usb-devel">http://marc.theaimsgroup.com/?l=linux-usb-devel</ulink> + </para> + <para> + Programming Guide for Linux USB Device Drivers: <ulink url="http://usb.cs.tum.edu/usbdoc">http://usb.cs.tum.edu/usbdoc</ulink> + </para> + <para> + USB Home Page: <ulink url="http://www.usb.org">http://www.usb.org</ulink> + </para> + </chapter> + +</book> diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl new file mode 100644 index 0000000..6f3883b --- /dev/null +++ b/Documentation/DocBook/z8530book.tmpl @@ -0,0 +1,371 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="Z85230Guide"> + <bookinfo> + <title>Z8530 Programming Guide</title> + + <authorgroup> + <author> + <firstname>Alan</firstname> + <surname>Cox</surname> + <affiliation> + <address> + <email>alan@lxorguk.ukuu.org.uk</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2000</year> + <holder>Alan Cox</holder> + </copyright> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="intro"> + <title>Introduction</title> + <para> + The Z85x30 family synchronous/asynchronous controller chips are + used on a large number of cheap network interface cards. The + kernel provides a core interface layer that is designed to make + it easy to provide WAN services using this chip. + </para> + <para> + The current driver only support synchronous operation. Merging the + asynchronous driver support into this code to allow any Z85x30 + device to be used as both a tty interface and as a synchronous + controller is a project for Linux post the 2.4 release + </para> + </chapter> + + <chapter id="Driver_Modes"> + <title>Driver Modes</title> + <para> + The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices + in three different modes. Each mode can be applied to an individual + channel on the chip (each chip has two channels). + </para> + <para> + The PIO synchronous mode supports the most common Z8530 wiring. Here + the chip is interface to the I/O and interrupt facilities of the + host machine but not to the DMA subsystem. When running PIO the + Z8530 has extremely tight timing requirements. Doing high speeds, + even with a Z85230 will be tricky. Typically you should expect to + achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230. + </para> + <para> + The DMA mode supports the chip when it is configured to use dual DMA + channels on an ISA bus. The better cards tend to support this mode + of operation for a single channel. With DMA running the Z85230 tops + out when it starts to hit ISA DMA constraints at about 512Kbits. It + is worth noting here that many PC machines hang or crash when the + chip is driven fast enough to hold the ISA bus solid. + </para> + <para> + Transmit DMA mode uses a single DMA channel. The DMA channel is used + for transmission as the transmit FIFO is smaller than the receive + FIFO. it gives better performance than pure PIO mode but is nowhere + near as ideal as pure DMA mode. + </para> + </chapter> + + <chapter id="Using_the_Z85230_driver"> + <title>Using the Z85230 driver</title> + <para> + The Z85230 driver provides the back end interface to your board. To + configure a Z8530 interface you need to detect the board and to + identify its ports and interrupt resources. It is also your problem + to verify the resources are available. + </para> + <para> + Having identified the chip you need to fill in a struct z8530_dev, + which describes each chip. This object must exist until you finally + shutdown the board. Firstly zero the active field. This ensures + nothing goes off without you intending it. The irq field should + be set to the interrupt number of the chip. (Each chip has a single + interrupt source rather than each channel). You are responsible + for allocating the interrupt line. The interrupt handler should be + set to <function>z8530_interrupt</function>. The device id should + be set to the z8530_dev structure pointer. Whether the interrupt can + be shared or not is board dependent, and up to you to initialise. + </para> + <para> + The structure holds two channel structures. + Initialise chanA.ctrlio and chanA.dataio with the address of the + control and data ports. You can or this with Z8530_PORT_SLEEP to + indicate your interface needs the 5uS delay for chip settling done + in software. The PORT_SLEEP option is architecture specific. Other + flags may become available on future platforms, eg for MMIO. + Initialise the chanA.irqs to &z8530_nop to start the chip up + as disabled and discarding interrupt events. This ensures that + stray interrupts will be mopped up and not hang the bus. Set + chanA.dev to point to the device structure itself. The + private and name field you may use as you wish. The private field + is unused by the Z85230 layer. The name is used for error reporting + and it may thus make sense to make it match the network name. + </para> + <para> + Repeat the same operation with the B channel if your chip has + both channels wired to something useful. This isn't always the + case. If it is not wired then the I/O values do not matter, but + you must initialise chanB.dev. + </para> + <para> + If your board has DMA facilities then initialise the txdma and + rxdma fields for the relevant channels. You must also allocate the + ISA DMA channels and do any necessary board level initialisation + to configure them. The low level driver will do the Z8530 and + DMA controller programming but not board specific magic. + </para> + <para> + Having initialised the device you can then call + <function>z8530_init</function>. This will probe the chip and + reset it into a known state. An identification sequence is then + run to identify the chip type. If the checks fail to pass the + function returns a non zero error code. Typically this indicates + that the port given is not valid. After this call the + type field of the z8530_dev structure is initialised to either + Z8530, Z85C30 or Z85230 according to the chip found. + </para> + <para> + Once you have called z8530_init you can also make use of the utility + function <function>z8530_describe</function>. This provides a + consistent reporting format for the Z8530 devices, and allows all + the drivers to provide consistent reporting. + </para> + </chapter> + + <chapter id="Attaching_Network_Interfaces"> + <title>Attaching Network Interfaces</title> + <para> + If you wish to use the network interface facilities of the driver, + then you need to attach a network device to each channel that is + present and in use. In addition to use the generic HDLC + you need to follow some additional plumbing rules. They may seem + complex but a look at the example hostess_sv11 driver should + reassure you. + </para> + <para> + The network device used for each channel should be pointed to by + the netdevice field of each channel. The hdlc-> priv field of the + network device points to your private data - you will need to be + able to find your private data from this. + </para> + <para> + The way most drivers approach this particular problem is to + create a structure holding the Z8530 device definition and + put that into the private field of the network device. The + network device fields of the channels then point back to the + network devices. + </para> + <para> + If you wish to use the generic HDLC then you need to register + the HDLC device. + </para> + <para> + Before you register your network device you will also need to + provide suitable handlers for most of the network device callbacks. + See the network device documentation for more details on this. + </para> + </chapter> + + <chapter id="Configuring_And_Activating_The_Port"> + <title>Configuring And Activating The Port</title> + <para> + The Z85230 driver provides helper functions and tables to load the + port registers on the Z8530 chips. When programming the register + settings for a channel be aware that the documentation recommends + initialisation orders. Strange things happen when these are not + followed. + </para> + <para> + <function>z8530_channel_load</function> takes an array of + pairs of initialisation values in an array of u8 type. The first + value is the Z8530 register number. Add 16 to indicate the alternate + register bank on the later chips. The array is terminated by a 255. + </para> + <para> + The driver provides a pair of public tables. The + z8530_hdlc_kilostream table is for the UK 'Kilostream' service and + also happens to cover most other end host configurations. The + z8530_hdlc_kilostream_85230 table is the same configuration using + the enhancements of the 85230 chip. The configuration loaded is + standard NRZ encoded synchronous data with HDLC bitstuffing. All + of the timing is taken from the other end of the link. + </para> + <para> + When writing your own tables be aware that the driver internally + tracks register values. It may need to reload values. You should + therefore be sure to set registers 1-7, 9-11, 14 and 15 in all + configurations. Where the register settings depend on DMA selection + the driver will update the bits itself when you open or close. + Loading a new table with the interface open is not recommended. + </para> + <para> + There are three standard configurations supported by the core + code. In PIO mode the interface is programmed up to use + interrupt driven PIO. This places high demands on the host processor + to avoid latency. The driver is written to take account of latency + issues but it cannot avoid latencies caused by other drivers, + notably IDE in PIO mode. Because the drivers allocate buffers you + must also prevent MTU changes while the port is open. + </para> + <para> + Once the port is open it will call the rx_function of each channel + whenever a completed packet arrived. This is invoked from + interrupt context and passes you the channel and a network + buffer (struct sk_buff) holding the data. The data includes + the CRC bytes so most users will want to trim the last two + bytes before processing the data. This function is very timing + critical. When you wish to simply discard data the support + code provides the function <function>z8530_null_rx</function> + to discard the data. + </para> + <para> + To active PIO mode sending and receiving the <function> + z8530_sync_open</function> is called. This expects to be passed + the network device and the channel. Typically this is called from + your network device open callback. On a failure a non zero error + status is returned. The <function>z8530_sync_close</function> + function shuts down a PIO channel. This must be done before the + channel is opened again and before the driver shuts down + and unloads. + </para> + <para> + The ideal mode of operation is dual channel DMA mode. Here the + kernel driver will configure the board for DMA in both directions. + The driver also handles ISA DMA issues such as controller + programming and the memory range limit for you. This mode is + activated by calling the <function>z8530_sync_dma_open</function> + function. On failure a non zero error value is returned. + Once this mode is activated it can be shut down by calling the + <function>z8530_sync_dma_close</function>. You must call the close + function matching the open mode you used. + </para> + <para> + The final supported mode uses a single DMA channel to drive the + transmit side. As the Z85C30 has a larger FIFO on the receive + channel this tends to increase the maximum speed a little. + This is activated by calling the <function>z8530_sync_txdma_open + </function>. This returns a non zero error code on failure. The + <function>z8530_sync_txdma_close</function> function closes down + the Z8530 interface from this mode. + </para> + </chapter> + + <chapter id="Network_Layer_Functions"> + <title>Network Layer Functions</title> + <para> + The Z8530 layer provides functions to queue packets for + transmission. The driver internally buffers the frame currently + being transmitted and one further frame (in order to keep back + to back transmission running). Any further buffering is up to + the caller. + </para> + <para> + The function <function>z8530_queue_xmit</function> takes a network + buffer in sk_buff format and queues it for transmission. The + caller must provide the entire packet with the exception of the + bitstuffing and CRC. This is normally done by the caller via + the generic HDLC interface layer. It returns 0 if the buffer has been + queued and non zero values for queue full. If the function accepts + the buffer it becomes property of the Z8530 layer and the caller + should not free it. + </para> + <para> + The function <function>z8530_get_stats</function> returns a pointer + to an internally maintained per interface statistics block. This + provides most of the interface code needed to implement the network + layer get_stats callback. + </para> + </chapter> + + <chapter id="Porting_The_Z8530_Driver"> + <title>Porting The Z8530 Driver</title> + <para> + The Z8530 driver is written to be portable. In DMA mode it makes + assumptions about the use of ISA DMA. These are probably warranted + in most cases as the Z85230 in particular was designed to glue to PC + type machines. The PIO mode makes no real assumptions. + </para> + <para> + Should you need to retarget the Z8530 driver to another architecture + the only code that should need changing are the port I/O functions. + At the moment these assume PC I/O port accesses. This may not be + appropriate for all platforms. Replacing + <function>z8530_read_port</function> and <function>z8530_write_port + </function> is intended to be all that is required to port this + driver layer. + </para> + </chapter> + + <chapter id="bugs"> + <title>Known Bugs And Assumptions</title> + <para> + <variablelist> + <varlistentry><term>Interrupt Locking</term> + <listitem> + <para> + The locking in the driver is done via the global cli/sti lock. This + makes for relatively poor SMP performance. Switching this to use a + per device spin lock would probably materially improve performance. + </para> + </listitem></varlistentry> + + <varlistentry><term>Occasional Failures</term> + <listitem> + <para> + We have reports of occasional failures when run for very long + periods of time and the driver starts to receive junk frames. At + the moment the cause of this is not clear. + </para> + </listitem></varlistentry> + </variablelist> + + </para> + </chapter> + + <chapter id="pubfunctions"> + <title>Public Functions Provided</title> +!Edrivers/net/wan/z85230.c + </chapter> + + <chapter id="intfunctions"> + <title>Internal Functions</title> +!Idrivers/net/wan/z85230.c + </chapter> + +</book> |